diff --git a/dapo_lora_7b_20251202_002719/checkpoint-192/trainer_state.json b/dapo_lora_7b_20251202_002719/checkpoint-192/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..67737072f97d4cd7de1802b35a96280c775c5059
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-192/trainer_state.json
@@ -0,0 +1,5986 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.08831646734130635,
+  "eval_steps": 500,
+  "global_step": 192,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16233.0,
+      "completions/max_terminated_length": 16233.0,
+      "completions/mean_length": 5701.859375,
+      "completions/mean_terminated_length": 5701.859375,
+      "completions/min_length": 630.0,
+      "completions/min_terminated_length": 630.0,
+      "entropy": 0.35103847086429596,
+      "epoch": 0.00045998160073597056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0027150087989866734,
+      "learning_rate": 1e-05,
+      "loss": 0.0764,
+      "num_tokens": 372903.0,
+      "reward": 0.71875,
+      "reward_std": 0.4581822156906128,
+      "rewards/accuracy_reward/mean": 0.71875,
+      "rewards/accuracy_reward/std": 0.4531635046005249,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000617504119873,
+      "sampling/importance_sampling_ratio/min": 0.2750210464000702,
+      "sampling/sampling_logp_difference/max": 1.290907621383667,
+      "sampling/sampling_logp_difference/mean": 0.01358163170516491,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.00010992912939400412,
+      "clip_ratio/high_mean": 2.748228234850103e-05,
+      "clip_ratio/low_mean": 0.00016060493635450257,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001880872223409824,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 7385.90625,
+      "completions/mean_terminated_length": 6455.06884765625,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "entropy": 0.5675897598266602,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0007440462941303849,
+      "learning_rate": 1e-05,
+      "loss": -0.0152,
+      "num_tokens": 856873.0,
+      "reward": 0.390625,
+      "reward_std": 0.2198973000049591,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999367594718933,
+      "sampling/importance_sampling_ratio/min": 0.009396589361131191,
+      "sampling/sampling_logp_difference/max": 4.667408466339111,
+      "sampling/sampling_logp_difference/mean": 0.022290317341685295,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.00018680206630961038,
+      "clip_ratio/high_mean": 7.093910403455084e-05,
+      "clip_ratio/low_mean": 0.0002504906224203296,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00032142972168003325,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15091.0,
+      "completions/mean_length": 5608.828125,
+      "completions/mean_terminated_length": 5437.7939453125,
+      "completions/min_length": 936.0,
+      "completions/min_terminated_length": 936.0,
+      "entropy": 0.44635456055402756,
+      "epoch": 0.0013799448022079118,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002476191846653819,
+      "learning_rate": 1e-05,
+      "loss": 0.0755,
+      "num_tokens": 1225782.0,
+      "reward": 0.578125,
+      "reward_std": 0.3776973485946655,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999972581863403,
+      "sampling/importance_sampling_ratio/min": 0.16118201613426208,
+      "sampling/sampling_logp_difference/max": 1.825221061706543,
+      "sampling/sampling_logp_difference/mean": 0.017525848001241684,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0002787337944027968,
+      "clip_ratio/high_mean": 8.000510115380166e-05,
+      "clip_ratio/low_mean": 0.00027736531956179533,
+      "clip_ratio/low_min": 2.338634294574149e-05,
+      "clip_ratio/region_mean": 0.0003573704316295334,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14515.0,
+      "completions/max_terminated_length": 14515.0,
+      "completions/mean_length": 3346.078125,
+      "completions/mean_terminated_length": 3346.078125,
+      "completions/min_length": 793.0,
+      "completions/min_terminated_length": 793.0,
+      "entropy": 0.545745424926281,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0037713816855102777,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 1453315.0,
+      "reward": 0.4375,
+      "reward_std": 0.4413174092769623,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000426769256592,
+      "sampling/importance_sampling_ratio/min": 0.08748604357242584,
+      "sampling/sampling_logp_difference/max": 2.4362759590148926,
+      "sampling/sampling_logp_difference/mean": 0.016878074035048485,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0002736507922236342,
+      "clip_ratio/high_mean": 0.00012070279444742482,
+      "clip_ratio/low_mean": 0.00037263989906932693,
+      "clip_ratio/low_min": 7.880559132900089e-05,
+      "clip_ratio/region_mean": 0.0004933426898787729,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15992.0,
+      "completions/mean_length": 7791.578125,
+      "completions/mean_terminated_length": 5601.35302734375,
+      "completions/min_length": 788.0,
+      "completions/min_terminated_length": 788.0,
+      "entropy": 0.4527555741369724,
+      "epoch": 0.0022999080036798527,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0019191562896594405,
+      "learning_rate": 1e-05,
+      "loss": 0.066,
+      "num_tokens": 1962144.0,
+      "reward": 0.484375,
+      "reward_std": 0.4987064301967621,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000364780426025,
+      "sampling/importance_sampling_ratio/min": 0.09914527088403702,
+      "sampling/sampling_logp_difference/max": 2.311169147491455,
+      "sampling/sampling_logp_difference/mean": 0.019328925758600235,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.000247960046181106,
+      "clip_ratio/high_mean": 6.500758581751143e-05,
+      "clip_ratio/low_mean": 8.249791471826029e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00014750550326425582,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15770.0,
+      "completions/mean_length": 4726.546875,
+      "completions/mean_terminated_length": 4350.5,
+      "completions/min_length": 757.0,
+      "completions/min_terminated_length": 757.0,
+      "entropy": 0.5126069597899914,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002579454565420747,
+      "learning_rate": 1e-05,
+      "loss": -0.0359,
+      "num_tokens": 2273043.0,
+      "reward": 0.484375,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999161958694458,
+      "sampling/importance_sampling_ratio/min": 0.0002888332528527826,
+      "sampling/sampling_logp_difference/max": 8.14966106414795,
+      "sampling/sampling_logp_difference/mean": 0.01803017407655716,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.00017989838943321956,
+      "clip_ratio/high_mean": 6.093144725127786e-05,
+      "clip_ratio/low_mean": 0.00028579145509866066,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003467229043963016,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12743.0,
+      "completions/mean_length": 7409.0625,
+      "completions/mean_terminated_length": 6480.62060546875,
+      "completions/min_length": 879.0,
+      "completions/min_terminated_length": 879.0,
+      "entropy": 0.494194608181715,
+      "epoch": 0.003219871205151794,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002430765191093087,
+      "learning_rate": 1e-05,
+      "loss": 0.0822,
+      "num_tokens": 2757655.0,
+      "reward": 0.46875,
+      "reward_std": 0.40715816617012024,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999563694000244,
+      "sampling/importance_sampling_ratio/min": 0.17787444591522217,
+      "sampling/sampling_logp_difference/max": 1.726677417755127,
+      "sampling/sampling_logp_difference/mean": 0.019815418869256973,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.00017167176974908216,
+      "clip_ratio/high_mean": 6.041262804501457e-05,
+      "clip_ratio/low_mean": 0.0002822945152729517,
+      "clip_ratio/low_min": 5.028157829656266e-05,
+      "clip_ratio/region_mean": 0.00034270713513251394,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13078.0,
+      "completions/mean_length": 4700.203125,
+      "completions/mean_terminated_length": 4323.30615234375,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "entropy": 0.39490213245153427,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0022012051194906235,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 3072436.0,
+      "reward": 0.609375,
+      "reward_std": 0.49446311593055725,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998409152030945,
+      "sampling/importance_sampling_ratio/min": 0.06603337824344635,
+      "sampling/sampling_logp_difference/max": 2.717594861984253,
+      "sampling/sampling_logp_difference/mean": 0.016631681472063065,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.00013108045459375717,
+      "clip_ratio/high_mean": 4.318108904044493e-05,
+      "clip_ratio/low_mean": 0.00023819861780793872,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002813797018461628,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5188.890625,
+      "completions/mean_terminated_length": 4827.7578125,
+      "completions/min_length": 790.0,
+      "completions/min_terminated_length": 790.0,
+      "entropy": 0.43566014245152473,
+      "epoch": 0.004139834406623735,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0016241734847426414,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 3414085.0,
+      "reward": 0.59375,
+      "reward_std": 0.39820659160614014,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 1.9456381797790527,
+      "sampling/importance_sampling_ratio/mean": 1.0000399351119995,
+      "sampling/importance_sampling_ratio/min": 0.10360148549079895,
+      "sampling/sampling_logp_difference/max": 2.2672035694122314,
+      "sampling/sampling_logp_difference/mean": 0.01550372689962387,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.00010115922304976266,
+      "clip_ratio/high_mean": 2.5289805762440665e-05,
+      "clip_ratio/low_mean": 0.00034295484147151,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003682446440507192,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15217.0,
+      "completions/mean_length": 5832.875,
+      "completions/mean_terminated_length": 5492.51611328125,
+      "completions/min_length": 717.0,
+      "completions/min_terminated_length": 717.0,
+      "entropy": 0.600818321108818,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0010776554699987173,
+      "learning_rate": 1e-05,
+      "loss": -0.0314,
+      "num_tokens": 3798397.0,
+      "reward": 0.328125,
+      "reward_std": 0.37298911809921265,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999531507492065,
+      "sampling/importance_sampling_ratio/min": 0.0953303873538971,
+      "sampling/sampling_logp_difference/max": 2.3504066467285156,
+      "sampling/sampling_logp_difference/mean": 0.020683372393250465,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.00030824893383396557,
+      "clip_ratio/high_mean": 0.00011632417340479151,
+      "clip_ratio/low_mean": 0.0002341717704439361,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003504959422571119,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15746.0,
+      "completions/max_terminated_length": 15746.0,
+      "completions/mean_length": 4986.171875,
+      "completions/mean_terminated_length": 4986.171875,
+      "completions/min_length": 550.0,
+      "completions/min_terminated_length": 550.0,
+      "entropy": 0.40387310832738876,
+      "epoch": 0.005059797608095676,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003584277583286166,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 4127424.0,
+      "reward": 0.671875,
+      "reward_std": 0.4434390664100647,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998393654823303,
+      "sampling/importance_sampling_ratio/min": 0.02629905194044113,
+      "sampling/sampling_logp_difference/max": 3.6382224559783936,
+      "sampling/sampling_logp_difference/mean": 0.01555373053997755,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.00013135069002601085,
+      "clip_ratio/high_mean": 4.189404148746689e-05,
+      "clip_ratio/low_mean": 0.00014246321052269195,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018435725178278517,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 10079.0,
+      "completions/max_terminated_length": 10079.0,
+      "completions/mean_length": 3880.515625,
+      "completions/mean_terminated_length": 3880.515625,
+      "completions/min_length": 674.0,
+      "completions/min_terminated_length": 674.0,
+      "entropy": 0.4064784087240696,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017852422315627337,
+      "learning_rate": 1e-05,
+      "loss": 0.0198,
+      "num_tokens": 4384473.0,
+      "reward": 0.671875,
+      "reward_std": 0.2867126166820526,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999723434448242,
+      "sampling/importance_sampling_ratio/min": 0.37220701575279236,
+      "sampling/sampling_logp_difference/max": 0.9883050918579102,
+      "sampling/sampling_logp_difference/mean": 0.013887828215956688,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.00014981444019213086,
+      "clip_ratio/high_mean": 4.5794572770319064e-05,
+      "clip_ratio/low_mean": 0.00040218312869910733,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00044797768418902706,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16341.0,
+      "completions/mean_length": 8894.578125,
+      "completions/mean_terminated_length": 7669.0361328125,
+      "completions/min_length": 1085.0,
+      "completions/min_terminated_length": 1085.0,
+      "entropy": 0.5499315299093723,
+      "epoch": 0.005979760809567618,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004000168293714523,
+      "learning_rate": 1e-05,
+      "loss": 0.0373,
+      "num_tokens": 4963350.0,
+      "reward": 0.390625,
+      "reward_std": 0.2824692726135254,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999991774559021,
+      "sampling/importance_sampling_ratio/min": 0.047493718564510345,
+      "sampling/sampling_logp_difference/max": 3.0471577644348145,
+      "sampling/sampling_logp_difference/mean": 0.02204228937625885,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.00018746273144643055,
+      "clip_ratio/high_mean": 5.583179722634668e-05,
+      "clip_ratio/low_mean": 0.0001284618601857801,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001842936590037425,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12058.0,
+      "completions/max_terminated_length": 12058.0,
+      "completions/mean_length": 4584.0625,
+      "completions/mean_terminated_length": 4584.0625,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 0.4566480815410614,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003257408272475004,
+      "learning_rate": 1e-05,
+      "loss": -0.0342,
+      "num_tokens": 5266274.0,
+      "reward": 0.671875,
+      "reward_std": 0.3751009702682495,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750256538391,
+      "sampling/importance_sampling_ratio/min": 0.39602163434028625,
+      "sampling/sampling_logp_difference/max": 0.9262864589691162,
+      "sampling/sampling_logp_difference/mean": 0.01598881185054779,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.00015991039845175692,
+      "clip_ratio/high_mean": 5.3697508178629505e-05,
+      "clip_ratio/low_mean": 0.0003120610426776693,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00036575855119735934,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15009.0,
+      "completions/mean_length": 5134.671875,
+      "completions/mean_terminated_length": 4581.42578125,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "entropy": 0.41497115045785904,
+      "epoch": 0.0068997240110395585,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004677772056311369,
+      "learning_rate": 1e-05,
+      "loss": 0.05,
+      "num_tokens": 5603925.0,
+      "reward": 0.640625,
+      "reward_std": 0.3913571238517761,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001789331436157,
+      "sampling/importance_sampling_ratio/min": 0.07364130765199661,
+      "sampling/sampling_logp_difference/max": 2.608549118041992,
+      "sampling/sampling_logp_difference/mean": 0.016165096312761307,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.00025949142946046777,
+      "clip_ratio/high_mean": 9.68364292930346e-05,
+      "clip_ratio/low_mean": 0.000282365266684792,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.000379201697796816,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15173.0,
+      "completions/max_terminated_length": 15173.0,
+      "completions/mean_length": 4904.96875,
+      "completions/mean_terminated_length": 4904.96875,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.4841916747391224,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.002402309561148286,
+      "learning_rate": 1e-05,
+      "loss": 0.0633,
+      "num_tokens": 5928091.0,
+      "reward": 0.484375,
+      "reward_std": 0.41246524453163147,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999504685401917,
+      "sampling/importance_sampling_ratio/min": 0.0037722671404480934,
+      "sampling/sampling_logp_difference/max": 5.580079078674316,
+      "sampling/sampling_logp_difference/mean": 0.018390391021966934,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 6.219606439117342e-05,
+      "clip_ratio/high_mean": 1.5549016097793356e-05,
+      "clip_ratio/low_mean": 0.00019023374534299364,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002057827605312923,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13837.0,
+      "completions/mean_length": 5209.84375,
+      "completions/mean_terminated_length": 3837.578857421875,
+      "completions/min_length": 126.0,
+      "completions/min_terminated_length": 126.0,
+      "entropy": 0.3513585068285465,
+      "epoch": 0.0078196872125115,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019373978720977902,
+      "learning_rate": 1e-05,
+      "loss": 0.0016,
+      "num_tokens": 6271057.0,
+      "reward": 0.453125,
+      "reward_std": 0.3403330445289612,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999862015247345,
+      "sampling/importance_sampling_ratio/min": 0.1450539529323578,
+      "sampling/sampling_logp_difference/max": 1.9306495189666748,
+      "sampling/sampling_logp_difference/mean": 0.013681268319487572,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0001431612308806507,
+      "clip_ratio/high_mean": 4.711323526862543e-05,
+      "clip_ratio/low_mean": 9.270217788071022e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001398154154230724,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 9328.0,
+      "completions/max_terminated_length": 9328.0,
+      "completions/mean_length": 2520.640625,
+      "completions/mean_terminated_length": 2520.640625,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "entropy": 0.36302734911441803,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027223003562539816,
+      "learning_rate": 1e-05,
+      "loss": -0.0416,
+      "num_tokens": 6441562.0,
+      "reward": 0.65625,
+      "reward_std": 0.33090677857398987,
+      "rewards/accuracy_reward/mean": 0.65625,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000017762184143,
+      "sampling/importance_sampling_ratio/min": 0.3734391927719116,
+      "sampling/sampling_logp_difference/max": 0.9850001335144043,
+      "sampling/sampling_logp_difference/mean": 0.011676793918013573,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.00017718410344969016,
+      "clip_ratio/high_mean": 5.833459545101505e-05,
+      "clip_ratio/low_mean": 0.0002528423356125131,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00031117693106352817,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15108.0,
+      "completions/mean_length": 4240.96875,
+      "completions/mean_terminated_length": 4048.222412109375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.3896213509142399,
+      "epoch": 0.008739650413983441,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.002503112656995654,
+      "learning_rate": 1e-05,
+      "loss": 0.0739,
+      "num_tokens": 6721568.0,
+      "reward": 0.59375,
+      "reward_std": 0.4991811513900757,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999947547912598,
+      "sampling/importance_sampling_ratio/min": 0.10363919287919998,
+      "sampling/sampling_logp_difference/max": 2.2668397426605225,
+      "sampling/sampling_logp_difference/mean": 0.014314994215965271,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0002049997847279883,
+      "clip_ratio/high_mean": 6.95637043008901e-05,
+      "clip_ratio/low_mean": 0.00011690972041833447,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018647342039912473,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15322.0,
+      "completions/mean_length": 3738.484375,
+      "completions/mean_terminated_length": 3116.573486328125,
+      "completions/min_length": 367.0,
+      "completions/min_terminated_length": 367.0,
+      "entropy": 0.29045598581433296,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002947593806311488,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 6969399.0,
+      "reward": 0.8125,
+      "reward_std": 0.23356688022613525,
+      "rewards/accuracy_reward/mean": 0.8125,
+      "rewards/accuracy_reward/std": 0.39339789748191833,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998925924301147,
+      "sampling/importance_sampling_ratio/min": 0.11472277343273163,
+      "sampling/sampling_logp_difference/max": 2.165236711502075,
+      "sampling/sampling_logp_difference/mean": 0.011310569941997528,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.00010545731220190646,
+      "clip_ratio/high_mean": 3.014280719071394e-05,
+      "clip_ratio/low_mean": 0.00011199774735359824,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00014214055443062534,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15392.0,
+      "completions/mean_length": 6065.90625,
+      "completions/mean_terminated_length": 5191.49169921875,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.44125597178936005,
+      "epoch": 0.009659613615455382,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0011246009962633252,
+      "learning_rate": 1e-05,
+      "loss": 0.0021,
+      "num_tokens": 7365937.0,
+      "reward": 0.421875,
+      "reward_std": 0.23144522309303284,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000147819519043,
+      "sampling/importance_sampling_ratio/min": 0.25809481739997864,
+      "sampling/sampling_logp_difference/max": 1.3544282913208008,
+      "sampling/sampling_logp_difference/mean": 0.017348822206258774,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0003601935495680664,
+      "clip_ratio/high_mean": 9.941099415300414e-05,
+      "clip_ratio/low_mean": 0.00034870224044425413,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0004481132409637212,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 10951.0,
+      "completions/mean_length": 3722.015625,
+      "completions/mean_terminated_length": 3521.031982421875,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "entropy": 0.4340820461511612,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.001601650146767497,
+      "learning_rate": 1e-05,
+      "loss": 0.0015,
+      "num_tokens": 7615658.0,
+      "reward": 0.5,
+      "reward_std": 0.3913668990135193,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998630285263062,
+      "sampling/importance_sampling_ratio/min": 1.3064802715234691e-06,
+      "sampling/sampling_logp_difference/max": 13.548173904418945,
+      "sampling/sampling_logp_difference/mean": 0.016604293137788773,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0002349931419303175,
+      "clip_ratio/high_mean": 6.471897268056637e-05,
+      "clip_ratio/low_mean": 0.00014105365880823229,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00020577262966980925,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15141.0,
+      "completions/max_terminated_length": 15141.0,
+      "completions/mean_length": 3747.484375,
+      "completions/mean_terminated_length": 3747.484375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.43806017562747,
+      "epoch": 0.010579576816927323,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017510901670902967,
+      "learning_rate": 1e-05,
+      "loss": -0.0391,
+      "num_tokens": 7867545.0,
+      "reward": 0.5625,
+      "reward_std": 0.22461533546447754,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000494718551636,
+      "sampling/importance_sampling_ratio/min": 0.1432838886976242,
+      "sampling/sampling_logp_difference/max": 1.942927360534668,
+      "sampling/sampling_logp_difference/mean": 0.015971330925822258,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0002638470396050252,
+      "clip_ratio/high_mean": 8.973176045401487e-05,
+      "clip_ratio/low_mean": 0.0001654990855968208,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002552308424128569,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15089.0,
+      "completions/mean_length": 4868.609375,
+      "completions/mean_terminated_length": 4685.82568359375,
+      "completions/min_length": 1304.0,
+      "completions/min_terminated_length": 1304.0,
+      "entropy": 0.3689058944582939,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0025512739084661007,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 8187720.0,
+      "reward": 0.625,
+      "reward_std": 0.35824596881866455,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999942779541016,
+      "sampling/importance_sampling_ratio/min": 0.21243424713611603,
+      "sampling/sampling_logp_difference/max": 1.5491228103637695,
+      "sampling/sampling_logp_difference/mean": 0.01530374214053154,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.00016221465284615988,
+      "clip_ratio/high_mean": 5.93273357480939e-05,
+      "clip_ratio/low_mean": 0.0003561860394256655,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00041551337380951736,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16105.0,
+      "completions/mean_length": 7169.59375,
+      "completions/mean_terminated_length": 7023.33349609375,
+      "completions/min_length": 590.0,
+      "completions/min_terminated_length": 590.0,
+      "entropy": 0.5559867396950722,
+      "epoch": 0.011499540018399264,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0009040784207172692,
+      "learning_rate": 1e-05,
+      "loss": 0.0516,
+      "num_tokens": 8657286.0,
+      "reward": 0.328125,
+      "reward_std": 0.2414703518152237,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000077247619629,
+      "sampling/importance_sampling_ratio/min": 0.244469553232193,
+      "sampling/sampling_logp_difference/max": 1.4086644649505615,
+      "sampling/sampling_logp_difference/mean": 0.021266434341669083,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0001577084094606107,
+      "clip_ratio/high_mean": 4.298096519050887e-05,
+      "clip_ratio/low_mean": 0.00013108373877912527,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001740647035148868,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6734.921875,
+      "completions/mean_terminated_length": 6091.650390625,
+      "completions/min_length": 812.0,
+      "completions/min_terminated_length": 812.0,
+      "entropy": 0.44154683500528336,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002558791544288397,
+      "learning_rate": 1e-05,
+      "loss": 0.0372,
+      "num_tokens": 9099577.0,
+      "reward": 0.515625,
+      "reward_std": 0.2777610719203949,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999955296516418,
+      "sampling/importance_sampling_ratio/min": 0.077813521027565,
+      "sampling/sampling_logp_difference/max": 2.5534400939941406,
+      "sampling/sampling_logp_difference/mean": 0.0186590775847435,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.00014542990538757294,
+      "clip_ratio/high_mean": 3.6357476346893236e-05,
+      "clip_ratio/low_mean": 0.00021458245646499563,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00025093993099289946,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15505.0,
+      "completions/mean_length": 4848.078125,
+      "completions/mean_terminated_length": 4475.95166015625,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.4912428632378578,
+      "epoch": 0.012419503219871205,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017661909805610776,
+      "learning_rate": 1e-05,
+      "loss": 0.0957,
+      "num_tokens": 9420006.0,
+      "reward": 0.515625,
+      "reward_std": 0.3403330445289612,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000015139579773,
+      "sampling/importance_sampling_ratio/min": 0.14381231367588043,
+      "sampling/sampling_logp_difference/max": 1.9392461776733398,
+      "sampling/sampling_logp_difference/mean": 0.017206422984600067,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.00031798147210793104,
+      "clip_ratio/high_mean": 0.00010812525488290703,
+      "clip_ratio/low_mean": 0.00021282920124576776,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00032095445021695923,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15477.0,
+      "completions/mean_length": 5689.8125,
+      "completions/mean_terminated_length": 5163.86865234375,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "entropy": 0.4508574977517128,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0030540244188159704,
+      "learning_rate": 1e-05,
+      "loss": 0.0809,
+      "num_tokens": 9793746.0,
+      "reward": 0.53125,
+      "reward_std": 0.42552614212036133,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999897480010986,
+      "sampling/importance_sampling_ratio/min": 8.414050967076037e-07,
+      "sampling/sampling_logp_difference/max": 13.988192558288574,
+      "sampling/sampling_logp_difference/mean": 0.016547517850995064,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.00019940425045206212,
+      "clip_ratio/high_mean": 5.6281104662048165e-05,
+      "clip_ratio/low_mean": 0.00010776506042020628,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00016404616417275975,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14758.0,
+      "completions/max_terminated_length": 14758.0,
+      "completions/mean_length": 3069.78125,
+      "completions/mean_terminated_length": 3069.78125,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.39274851977825165,
+      "epoch": 0.013339466421343146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034625211264938116,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 10000348.0,
+      "reward": 0.546875,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000138282775879,
+      "sampling/importance_sampling_ratio/min": 0.32597410678863525,
+      "sampling/sampling_logp_difference/max": 1.1209373474121094,
+      "sampling/sampling_logp_difference/mean": 0.014218954369425774,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.00012761429206875619,
+      "clip_ratio/high_mean": 4.307139124648529e-05,
+      "clip_ratio/low_mean": 0.00010018590637628222,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00014325729807751486,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16360.0,
+      "completions/mean_length": 5308.3125,
+      "completions/mean_terminated_length": 4763.6064453125,
+      "completions/min_length": 550.0,
+      "completions/min_terminated_length": 550.0,
+      "entropy": 0.50441013276577,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.00156789505854249,
+      "learning_rate": 1e-05,
+      "loss": 0.0046,
+      "num_tokens": 10350440.0,
+      "reward": 0.515625,
+      "reward_std": 0.2109457552433014,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000964403152466,
+      "sampling/importance_sampling_ratio/min": 0.04705130681395531,
+      "sampling/sampling_logp_difference/max": 3.056516647338867,
+      "sampling/sampling_logp_difference/mean": 0.019430290907621384,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.00016632911138003692,
+      "clip_ratio/high_mean": 5.557040094572585e-05,
+      "clip_ratio/low_mean": 0.0002778837697405834,
+      "clip_ratio/low_min": 1.6620682799839415e-05,
+      "clip_ratio/region_mean": 0.00033345417978125624,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15490.0,
+      "completions/mean_length": 6388.265625,
+      "completions/mean_terminated_length": 5354.22412109375,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.5342313349246979,
+      "epoch": 0.014259429622815088,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026365246158093214,
+      "learning_rate": 1e-05,
+      "loss": 0.0118,
+      "num_tokens": 10768153.0,
+      "reward": 0.359375,
+      "reward_std": 0.31983357667922974,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998627305030823,
+      "sampling/importance_sampling_ratio/min": 0.26772308349609375,
+      "sampling/sampling_logp_difference/max": 1.31780207157135,
+      "sampling/sampling_logp_difference/mean": 0.017920637503266335,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.00017989536627283087,
+      "clip_ratio/high_mean": 5.500852148543345e-05,
+      "clip_ratio/low_mean": 0.00012964008692506468,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018464860841049813,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14732.0,
+      "completions/mean_length": 5229.078125,
+      "completions/mean_terminated_length": 4869.24169921875,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.38906631618738174,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022169759031385183,
+      "learning_rate": 1e-05,
+      "loss": 0.0213,
+      "num_tokens": 11111918.0,
+      "reward": 0.765625,
+      "reward_std": 0.3629639744758606,
+      "rewards/accuracy_reward/mean": 0.765625,
+      "rewards/accuracy_reward/std": 0.42695629596710205,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999115467071533,
+      "sampling/importance_sampling_ratio/min": 0.08817384392023087,
+      "sampling/sampling_logp_difference/max": 2.4284448623657227,
+      "sampling/sampling_logp_difference/mean": 0.015222044661641121,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.00014480652316706255,
+      "clip_ratio/high_mean": 4.443957550392952e-05,
+      "clip_ratio/low_mean": 0.00012809812687919475,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00017253770374736632,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 5148.453125,
+      "completions/mean_terminated_length": 4786.01611328125,
+      "completions/min_length": 815.0,
+      "completions/min_terminated_length": 815.0,
+      "entropy": 0.5083456933498383,
+      "epoch": 0.01517939282428703,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003128955839201808,
+      "learning_rate": 1e-05,
+      "loss": -0.0622,
+      "num_tokens": 11451323.0,
+      "reward": 0.53125,
+      "reward_std": 0.34034284949302673,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000025987625122,
+      "sampling/importance_sampling_ratio/min": 0.10359863191843033,
+      "sampling/sampling_logp_difference/max": 2.2672312259674072,
+      "sampling/sampling_logp_difference/mean": 0.017722681164741516,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 5.51352559341467e-05,
+      "clip_ratio/high_mean": 1.3783813983536675e-05,
+      "clip_ratio/low_mean": 7.914142133813584e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 9.292523554904619e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13048.0,
+      "completions/mean_length": 4609.5,
+      "completions/mean_terminated_length": 3824.533447265625,
+      "completions/min_length": 829.0,
+      "completions/min_terminated_length": 829.0,
+      "entropy": 0.49830054119229317,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0007577431970275939,
+      "learning_rate": 1e-05,
+      "loss": 0.0132,
+      "num_tokens": 11758275.0,
+      "reward": 0.375,
+      "reward_std": 0.2041158676147461,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998879432678223,
+      "sampling/importance_sampling_ratio/min": 0.05370701104402542,
+      "sampling/sampling_logp_difference/max": 2.9242117404937744,
+      "sampling/sampling_logp_difference/mean": 0.01685405895113945,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0001986039806070039,
+      "clip_ratio/high_mean": 6.727558275088086e-05,
+      "clip_ratio/low_mean": 0.0003367365798112587,
+      "clip_ratio/low_min": 6.28791003691731e-05,
+      "clip_ratio/region_mean": 0.000404012165745371,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14497.0,
+      "completions/mean_length": 4593.015625,
+      "completions/mean_terminated_length": 4013.130859375,
+      "completions/min_length": 1094.0,
+      "completions/min_terminated_length": 1094.0,
+      "entropy": 0.3128826189786196,
+      "epoch": 0.01609935602575897,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0026802816428244114,
+      "learning_rate": 1e-05,
+      "loss": 0.1212,
+      "num_tokens": 12063516.0,
+      "reward": 0.625,
+      "reward_std": 0.49234145879745483,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999848008155823,
+      "sampling/importance_sampling_ratio/min": 0.0008915311773307621,
+      "sampling/sampling_logp_difference/max": 7.0225701332092285,
+      "sampling/sampling_logp_difference/mean": 0.01317686028778553,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 7.243978234328097e-05,
+      "clip_ratio/high_mean": 1.8109945585820242e-05,
+      "clip_ratio/low_mean": 9.390242212248268e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00011201236907254497,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16161.0,
+      "completions/mean_length": 5015.171875,
+      "completions/mean_terminated_length": 4456.048828125,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 0.37973257526755333,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002345556626096368,
+      "learning_rate": 1e-05,
+      "loss": -0.0941,
+      "num_tokens": 12393103.0,
+      "reward": 0.640625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000877380371094,
+      "sampling/importance_sampling_ratio/min": 0.1842055469751358,
+      "sampling/sampling_logp_difference/max": 1.6917030811309814,
+      "sampling/sampling_logp_difference/mean": 0.0145792867988348,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.00014789494525757618,
+      "clip_ratio/high_mean": 4.601037198881386e-05,
+      "clip_ratio/low_mean": 0.0003090670288656838,
+      "clip_ratio/low_min": 1.8808304957929067e-05,
+      "clip_ratio/region_mean": 0.00035507740903995,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15632.0,
+      "completions/mean_length": 5598.484375,
+      "completions/mean_terminated_length": 5068.048828125,
+      "completions/min_length": 1283.0,
+      "completions/min_terminated_length": 1283.0,
+      "entropy": 0.35928424820303917,
+      "epoch": 0.01701931922723091,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0015618539182469249,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 12761230.0,
+      "reward": 0.546875,
+      "reward_std": 0.4240131676197052,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999818801879883,
+      "sampling/importance_sampling_ratio/min": 0.2261282205581665,
+      "sampling/sampling_logp_difference/max": 2.6031017303466797,
+      "sampling/sampling_logp_difference/mean": 0.01447785273194313,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 4.21205932070734e-05,
+      "clip_ratio/high_mean": 1.053014830176835e-05,
+      "clip_ratio/low_mean": 4.961071590514621e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.014086420691456e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16037.0,
+      "completions/mean_length": 5366.125,
+      "completions/mean_terminated_length": 4824.26220703125,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 0.41980869323015213,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0011855819029733539,
+      "learning_rate": 1e-05,
+      "loss": 0.0588,
+      "num_tokens": 13115038.0,
+      "reward": 0.5,
+      "reward_std": 0.17570312321186066,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999375343322754,
+      "sampling/importance_sampling_ratio/min": 0.15887950360774994,
+      "sampling/sampling_logp_difference/max": 1.839609146118164,
+      "sampling/sampling_logp_difference/mean": 0.015550841577351093,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0003506070097500924,
+      "clip_ratio/high_mean": 0.00010976320845657028,
+      "clip_ratio/low_mean": 0.0001256909990843269,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00023545420481241308,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15317.0,
+      "completions/max_terminated_length": 15317.0,
+      "completions/mean_length": 3308.296875,
+      "completions/mean_terminated_length": 3308.296875,
+      "completions/min_length": 786.0,
+      "completions/min_terminated_length": 786.0,
+      "entropy": 0.38983067497611046,
+      "epoch": 0.017939282428702852,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0023375866003334522,
+      "learning_rate": 1e-05,
+      "loss": 0.0624,
+      "num_tokens": 13335329.0,
+      "reward": 0.59375,
+      "reward_std": 0.3913668990135193,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99998939037323,
+      "sampling/importance_sampling_ratio/min": 0.0030945157632231712,
+      "sampling/sampling_logp_difference/max": 5.77812385559082,
+      "sampling/sampling_logp_difference/mean": 0.013900299556553364,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.000169710889167618,
+      "clip_ratio/high_mean": 5.673388113791589e-05,
+      "clip_ratio/low_mean": 0.00029868036835978273,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.000355414251316688,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15761.0,
+      "completions/mean_length": 5426.078125,
+      "completions/mean_terminated_length": 4497.44091796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.43789565935730934,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0025193989276885986,
+      "learning_rate": 1e-05,
+      "loss": 0.0349,
+      "num_tokens": 13691110.0,
+      "reward": 0.5,
+      "reward_std": 0.45134252309799194,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011682510376,
+      "sampling/importance_sampling_ratio/min": 0.14047929644584656,
+      "sampling/sampling_logp_difference/max": 1.9626951217651367,
+      "sampling/sampling_logp_difference/mean": 0.015961986035108566,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 8.76178437465569e-05,
+      "clip_ratio/high_mean": 2.3123878236219753e-05,
+      "clip_ratio/low_mean": 0.00019285815869807266,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002159820378437871,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16374.0,
+      "completions/mean_length": 4766.140625,
+      "completions/mean_terminated_length": 4194.77001953125,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "entropy": 0.47973789647221565,
+      "epoch": 0.018859245630174794,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0005962434806860983,
+      "learning_rate": 1e-05,
+      "loss": 0.0018,
+      "num_tokens": 14006911.0,
+      "reward": 0.484375,
+      "reward_std": 0.2382849156856537,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000395774841309,
+      "sampling/importance_sampling_ratio/min": 0.12198832631111145,
+      "sampling/sampling_logp_difference/max": 2.103829860687256,
+      "sampling/sampling_logp_difference/mean": 0.016915298998355865,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 6.694088551739696e-05,
+      "clip_ratio/high_mean": 2.3428712665918283e-05,
+      "clip_ratio/low_mean": 0.0002706102432057378,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002940389586001402,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14691.0,
+      "completions/mean_length": 5922.421875,
+      "completions/mean_terminated_length": 4637.66650390625,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.42647283896803856,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001872243476100266,
+      "learning_rate": 1e-05,
+      "loss": 0.0244,
+      "num_tokens": 14394946.0,
+      "reward": 0.4375,
+      "reward_std": 0.36295416951179504,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999639987945557,
+      "sampling/importance_sampling_ratio/min": 0.293357253074646,
+      "sampling/sampling_logp_difference/max": 2.1049091815948486,
+      "sampling/sampling_logp_difference/mean": 0.01656758040189743,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.00015323197931138566,
+      "clip_ratio/high_mean": 4.9833591447168146e-05,
+      "clip_ratio/low_mean": 0.00034982425768248504,
+      "clip_ratio/low_min": 1.088660519599216e-05,
+      "clip_ratio/region_mean": 0.0003996578489022795,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16255.0,
+      "completions/mean_length": 6493.1875,
+      "completions/mean_terminated_length": 6006.75390625,
+      "completions/min_length": 479.0,
+      "completions/min_terminated_length": 479.0,
+      "entropy": 0.4782983772456646,
+      "epoch": 0.019779208831646734,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.00166318379342556,
+      "learning_rate": 1e-05,
+      "loss": 0.0511,
+      "num_tokens": 14821182.0,
+      "reward": 0.46875,
+      "reward_std": 0.4092700183391571,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998992085456848,
+      "sampling/importance_sampling_ratio/min": 1.7716387219479657e-06,
+      "sampling/sampling_logp_difference/max": 13.243605613708496,
+      "sampling/sampling_logp_difference/mean": 0.018610000610351562,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 6.034070747773512e-05,
+      "clip_ratio/high_mean": 1.6863068026395922e-05,
+      "clip_ratio/low_mean": 9.460987712373026e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00011147294480906567,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16269.0,
+      "completions/mean_length": 4648.546875,
+      "completions/mean_terminated_length": 4269.98388671875,
+      "completions/min_length": 665.0,
+      "completions/min_terminated_length": 665.0,
+      "entropy": 0.4597437307238579,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0008557081455364823,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 15128561.0,
+      "reward": 0.328125,
+      "reward_std": 0.23144522309303284,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999915957450867,
+      "sampling/importance_sampling_ratio/min": 0.2670474946498871,
+      "sampling/sampling_logp_difference/max": 1.320328712463379,
+      "sampling/sampling_logp_difference/mean": 0.016183078289031982,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.00016895902081159875,
+      "clip_ratio/high_mean": 6.0399999711080454e-05,
+      "clip_ratio/low_mean": 0.0002296717866556719,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00029007178636675235,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 6930.234375,
+      "completions/mean_terminated_length": 6129.06787109375,
+      "completions/min_length": 682.0,
+      "completions/min_terminated_length": 682.0,
+      "entropy": 0.5115556567907333,
+      "epoch": 0.020699172033118676,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0016648141900077462,
+      "learning_rate": 1e-05,
+      "loss": 0.0232,
+      "num_tokens": 15582168.0,
+      "reward": 0.5625,
+      "reward_std": 0.3424547016620636,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000483989715576,
+      "sampling/importance_sampling_ratio/min": 0.187262162566185,
+      "sampling/sampling_logp_difference/max": 1.937586784362793,
+      "sampling/sampling_logp_difference/mean": 0.019788919016718864,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 9.100124134420184e-05,
+      "clip_ratio/high_mean": 3.351398640916159e-05,
+      "clip_ratio/low_mean": 0.000253890422754921,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002874044093914563,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16321.0,
+      "completions/mean_length": 6264.671875,
+      "completions/mean_terminated_length": 5938.24169921875,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "entropy": 0.43167873099446297,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0013617790536955,
+      "learning_rate": 1e-05,
+      "loss": 0.0032,
+      "num_tokens": 15994715.0,
+      "reward": 0.640625,
+      "reward_std": 0.3766237497329712,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999179840087891,
+      "sampling/importance_sampling_ratio/min": 0.1620832085609436,
+      "sampling/sampling_logp_difference/max": 1.8196454048156738,
+      "sampling/sampling_logp_difference/mean": 0.017889156937599182,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 6.15748222116963e-05,
+      "clip_ratio/high_mean": 1.870576988949324e-05,
+      "clip_ratio/low_mean": 0.0003191337254975224,
+      "clip_ratio/low_min": 4.877414176007733e-05,
+      "clip_ratio/region_mean": 0.0003378394994797418,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12838.0,
+      "completions/mean_length": 4168.140625,
+      "completions/mean_terminated_length": 3974.23828125,
+      "completions/min_length": 705.0,
+      "completions/min_terminated_length": 705.0,
+      "entropy": 0.433504331856966,
+      "epoch": 0.021619135234590615,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003133355872705579,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 16272044.0,
+      "reward": 0.34375,
+      "reward_std": 0.3377465009689331,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998913407325745,
+      "sampling/importance_sampling_ratio/min": 0.38697248697280884,
+      "sampling/sampling_logp_difference/max": 1.4266910552978516,
+      "sampling/sampling_logp_difference/mean": 0.014272443950176239,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 5.0198698772874195e-05,
+      "clip_ratio/high_mean": 1.2549674693218549e-05,
+      "clip_ratio/low_mean": 0.00024944932374637574,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002619989991217153,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5228.15625,
+      "completions/mean_terminated_length": 4868.2900390625,
+      "completions/min_length": 1099.0,
+      "completions/min_terminated_length": 1099.0,
+      "entropy": 0.6134471148252487,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002945883432403207,
+      "learning_rate": 1e-05,
+      "loss": 0.0237,
+      "num_tokens": 16616510.0,
+      "reward": 0.453125,
+      "reward_std": 0.39560043811798096,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000022053718567,
+      "sampling/importance_sampling_ratio/min": 0.23671367764472961,
+      "sampling/sampling_logp_difference/max": 1.4409040212631226,
+      "sampling/sampling_logp_difference/mean": 0.01892893575131893,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.00010992094757966697,
+      "clip_ratio/high_mean": 3.773104890569812e-05,
+      "clip_ratio/low_mean": 0.0002085948569856555,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002463259042997379,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15580.0,
+      "completions/max_terminated_length": 15580.0,
+      "completions/mean_length": 4286.90625,
+      "completions/mean_terminated_length": 4286.90625,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "entropy": 0.3194341119378805,
+      "epoch": 0.022539098436062558,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0033129912335425615,
+      "learning_rate": 1e-05,
+      "loss": -0.0135,
+      "num_tokens": 16903128.0,
+      "reward": 0.578125,
+      "reward_std": 0.4113916754722595,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000847578048706,
+      "sampling/importance_sampling_ratio/min": 0.14042755961418152,
+      "sampling/sampling_logp_difference/max": 1.9630634784698486,
+      "sampling/sampling_logp_difference/mean": 0.0129241943359375,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.00010812897107825847,
+      "clip_ratio/high_mean": 3.162783127663715e-05,
+      "clip_ratio/low_mean": 0.0001828691292757867,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00021449696214403957,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15964.0,
+      "completions/mean_length": 5032.125,
+      "completions/mean_terminated_length": 4070.101806640625,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "entropy": 0.4777919165790081,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021068111527711153,
+      "learning_rate": 1e-05,
+      "loss": -0.0866,
+      "num_tokens": 17236504.0,
+      "reward": 0.515625,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000327825546265,
+      "sampling/importance_sampling_ratio/min": 0.2832590341567993,
+      "sampling/sampling_logp_difference/max": 1.8220746517181396,
+      "sampling/sampling_logp_difference/mean": 0.01738543063402176,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.00012820017036574427,
+      "clip_ratio/high_mean": 3.647331323008984e-05,
+      "clip_ratio/low_mean": 0.00025561100665072445,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002920843198808143,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13201.0,
+      "completions/mean_length": 4803.203125,
+      "completions/mean_terminated_length": 4619.38134765625,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "entropy": 0.4494751952588558,
+      "epoch": 0.023459061637534497,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028032760601490736,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 17553269.0,
+      "reward": 0.609375,
+      "reward_std": 0.3403330445289612,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999998152256012,
+      "sampling/importance_sampling_ratio/min": 0.21100811660289764,
+      "sampling/sampling_logp_difference/max": 1.5558586120605469,
+      "sampling/sampling_logp_difference/mean": 0.01737060397863388,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.00010267168681821204,
+      "clip_ratio/high_mean": 3.3487939049337e-05,
+      "clip_ratio/low_mean": 0.00015384274320240365,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018733068225174065,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 7100.3125,
+      "completions/mean_terminated_length": 6643.7373046875,
+      "completions/min_length": 1183.0,
+      "completions/min_terminated_length": 1183.0,
+      "entropy": 0.5009776279330254,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001591994776390493,
+      "learning_rate": 1e-05,
+      "loss": -0.0421,
+      "num_tokens": 18016729.0,
+      "reward": 0.453125,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000343322753906,
+      "sampling/importance_sampling_ratio/min": 0.09941783547401428,
+      "sampling/sampling_logp_difference/max": 2.3084237575531006,
+      "sampling/sampling_logp_difference/mean": 0.01882891170680523,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.00016665930297676823,
+      "clip_ratio/high_mean": 5.2525359819810546e-05,
+      "clip_ratio/low_mean": 0.0004211304803902749,
+      "clip_ratio/low_min": 9.529018279863521e-05,
+      "clip_ratio/region_mean": 0.0004736558298645832,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14227.0,
+      "completions/mean_length": 6233.796875,
+      "completions/mean_terminated_length": 5557.1171875,
+      "completions/min_length": 1338.0,
+      "completions/min_terminated_length": 1338.0,
+      "entropy": 0.48881014063954353,
+      "epoch": 0.02437902483900644,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003694011364132166,
+      "learning_rate": 1e-05,
+      "loss": 0.1627,
+      "num_tokens": 18426140.0,
+      "reward": 0.625,
+      "reward_std": 0.3977220952510834,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000104904174805,
+      "sampling/importance_sampling_ratio/min": 0.20072485506534576,
+      "sampling/sampling_logp_difference/max": 1.6058201789855957,
+      "sampling/sampling_logp_difference/mean": 0.01879170536994934,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.00012100895446565119,
+      "clip_ratio/high_mean": 4.9377299660591234e-05,
+      "clip_ratio/low_mean": 0.00019421957949816715,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00024359687631658744,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15854.0,
+      "completions/mean_length": 5629.03125,
+      "completions/mean_terminated_length": 5282.0966796875,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.3631018362939358,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.001484633656218648,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 18794958.0,
+      "reward": 0.609375,
+      "reward_std": 0.4050365090370178,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000042200088501,
+      "sampling/importance_sampling_ratio/min": 0.002677773591130972,
+      "sampling/sampling_logp_difference/max": 5.922769546508789,
+      "sampling/sampling_logp_difference/mean": 0.013976464979350567,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.00021361040307965595,
+      "clip_ratio/high_mean": 8.756921079111635e-05,
+      "clip_ratio/low_mean": 0.0002042179089585261,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00029178711429267423,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 5366.453125,
+      "completions/mean_terminated_length": 5191.57177734375,
+      "completions/min_length": 593.0,
+      "completions/min_terminated_length": 593.0,
+      "entropy": 0.34573371335864067,
+      "epoch": 0.025298988040478382,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0018017840338870883,
+      "learning_rate": 1e-05,
+      "loss": -0.0307,
+      "num_tokens": 19148275.0,
+      "reward": 0.734375,
+      "reward_std": 0.4050365090370178,
+      "rewards/accuracy_reward/mean": 0.734375,
+      "rewards/accuracy_reward/std": 0.44515693187713623,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999160766601562,
+      "sampling/importance_sampling_ratio/min": 0.22769968211650848,
+      "sampling/sampling_logp_difference/max": 1.4797277450561523,
+      "sampling/sampling_logp_difference/mean": 0.014456957578659058,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.00020042336745973444,
+      "clip_ratio/high_mean": 5.850923639627581e-05,
+      "clip_ratio/low_mean": 0.00019344742031535134,
+      "clip_ratio/low_min": 1.594387686054688e-05,
+      "clip_ratio/region_mean": 0.0002519566587579902,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15942.0,
+      "completions/mean_length": 5801.921875,
+      "completions/mean_terminated_length": 5460.564453125,
+      "completions/min_length": 538.0,
+      "completions/min_terminated_length": 538.0,
+      "entropy": 0.4420101195573807,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0007390208193100989,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 19530718.0,
+      "reward": 0.421875,
+      "reward_std": 0.2993341088294983,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999010562896729,
+      "sampling/importance_sampling_ratio/min": 0.04691341519355774,
+      "sampling/sampling_logp_difference/max": 3.0594515800476074,
+      "sampling/sampling_logp_difference/mean": 0.016371876001358032,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0001929260479300865,
+      "clip_ratio/high_mean": 7.267188334481034e-05,
+      "clip_ratio/low_mean": 0.00013643273086927366,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00020910461648782075,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15886.0,
+      "completions/max_terminated_length": 15886.0,
+      "completions/mean_length": 3581.09375,
+      "completions/mean_terminated_length": 3581.09375,
+      "completions/min_length": 615.0,
+      "completions/min_terminated_length": 615.0,
+      "entropy": 0.36750902235507965,
+      "epoch": 0.02621895124195032,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020201546140015125,
+      "learning_rate": 1e-05,
+      "loss": 0.1245,
+      "num_tokens": 19771076.0,
+      "reward": 0.578125,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000082015991211,
+      "sampling/importance_sampling_ratio/min": 0.21508392691612244,
+      "sampling/sampling_logp_difference/max": 2.204270362854004,
+      "sampling/sampling_logp_difference/mean": 0.013558689504861832,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.00019395453546167118,
+      "clip_ratio/high_mean": 6.426821187233145e-05,
+      "clip_ratio/low_mean": 0.00017469121939939214,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00023895943377283402,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14150.0,
+      "completions/max_terminated_length": 14150.0,
+      "completions/mean_length": 4180.46875,
+      "completions/mean_terminated_length": 4180.46875,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.4649594761431217,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0028552189469337463,
+      "learning_rate": 1e-05,
+      "loss": 0.0924,
+      "num_tokens": 20048138.0,
+      "reward": 0.53125,
+      "reward_std": 0.4276576042175293,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000083446502686,
+      "sampling/importance_sampling_ratio/min": 0.2393883913755417,
+      "sampling/sampling_logp_difference/max": 1.4296680688858032,
+      "sampling/sampling_logp_difference/mean": 0.017490293830633163,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.00014915554584149504,
+      "clip_ratio/high_mean": 3.9898490058476455e-05,
+      "clip_ratio/low_mean": 5.383538700698409e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 9.373387524647114e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15323.0,
+      "completions/max_terminated_length": 15323.0,
+      "completions/mean_length": 4642.15625,
+      "completions/mean_terminated_length": 4642.15625,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "entropy": 0.41386983543634415,
+      "epoch": 0.027138914443422264,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014837872004136443,
+      "learning_rate": 1e-05,
+      "loss": -0.0232,
+      "num_tokens": 20355020.0,
+      "reward": 0.65625,
+      "reward_std": 0.3198433816432953,
+      "rewards/accuracy_reward/mean": 0.65625,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001411437988281,
+      "sampling/importance_sampling_ratio/min": 0.022514859214425087,
+      "sampling/sampling_logp_difference/max": 3.7935798168182373,
+      "sampling/sampling_logp_difference/mean": 0.015344480983912945,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 7.379077214864083e-05,
+      "clip_ratio/high_mean": 2.223373576271115e-05,
+      "clip_ratio/low_mean": 0.00013174474815969006,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001539784839224012,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15170.0,
+      "completions/max_terminated_length": 15170.0,
+      "completions/mean_length": 3369.015625,
+      "completions/mean_terminated_length": 3369.015625,
+      "completions/min_length": 267.0,
+      "completions/min_terminated_length": 267.0,
+      "entropy": 0.46293293312191963,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023857210762798786,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 20579309.0,
+      "reward": 0.40625,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999541640281677,
+      "sampling/importance_sampling_ratio/min": 0.00012647465337067842,
+      "sampling/sampling_logp_difference/max": 8.975468635559082,
+      "sampling/sampling_logp_difference/mean": 0.016323832795023918,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.00010131701310456265,
+      "clip_ratio/high_mean": 3.068578371312469e-05,
+      "clip_ratio/low_mean": 0.00017564234258315992,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002063281253867899,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15267.0,
+      "completions/mean_length": 4186.265625,
+      "completions/mean_terminated_length": 3992.651123046875,
+      "completions/min_length": 636.0,
+      "completions/min_terminated_length": 636.0,
+      "entropy": 0.4424850195646286,
+      "epoch": 0.028058877644894203,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.001888959901407361,
+      "learning_rate": 1e-05,
+      "loss": -0.0867,
+      "num_tokens": 20858230.0,
+      "reward": 0.5,
+      "reward_std": 0.43401283025741577,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001115798950195,
+      "sampling/importance_sampling_ratio/min": 0.21523967385292053,
+      "sampling/sampling_logp_difference/max": 1.5360031127929688,
+      "sampling/sampling_logp_difference/mean": 0.015638090670108795,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.00018883940902014729,
+      "clip_ratio/high_mean": 6.83412895341462e-05,
+      "clip_ratio/low_mean": 0.00029582804199890234,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003641693292593118,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15784.0,
+      "completions/mean_length": 8232.328125,
+      "completions/mean_terminated_length": 7231.24560546875,
+      "completions/min_length": 532.0,
+      "completions/min_terminated_length": 532.0,
+      "entropy": 0.4720785431563854,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0010464832885190845,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 21394763.0,
+      "reward": 0.421875,
+      "reward_std": 0.30617380142211914,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999436736106873,
+      "sampling/importance_sampling_ratio/min": 0.05187493562698364,
+      "sampling/sampling_logp_difference/max": 2.9589195251464844,
+      "sampling/sampling_logp_difference/mean": 0.019340507686138153,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 7.807558085914934e-05,
+      "clip_ratio/high_mean": 2.2267657527663687e-05,
+      "clip_ratio/low_mean": 0.0001811299157452595,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00020339757793408353,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15783.0,
+      "completions/mean_length": 6065.875,
+      "completions/mean_terminated_length": 5558.42578125,
+      "completions/min_length": 763.0,
+      "completions/min_terminated_length": 763.0,
+      "entropy": 0.5249982811510563,
+      "epoch": 0.028978840846366146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016154105542227626,
+      "learning_rate": 1e-05,
+      "loss": 0.1536,
+      "num_tokens": 21793091.0,
+      "reward": 0.40625,
+      "reward_std": 0.2756394147872925,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998780488967896,
+      "sampling/importance_sampling_ratio/min": 0.05374135076999664,
+      "sampling/sampling_logp_difference/max": 2.923572540283203,
+      "sampling/sampling_logp_difference/mean": 0.017961012199521065,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 3.358934282005066e-05,
+      "clip_ratio/high_mean": 8.397335705012665e-06,
+      "clip_ratio/low_mean": 3.994480266555911e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.834213746107707e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5830.015625,
+      "completions/mean_terminated_length": 5489.564453125,
+      "completions/min_length": 487.0,
+      "completions/min_terminated_length": 487.0,
+      "entropy": 0.49247242510318756,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0013925280654802918,
+      "learning_rate": 1e-05,
+      "loss": 0.0145,
+      "num_tokens": 22176908.0,
+      "reward": 0.375,
+      "reward_std": 0.1872510462999344,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000190734863281,
+      "sampling/importance_sampling_ratio/min": 0.00015296634228434414,
+      "sampling/sampling_logp_difference/max": 8.785292625427246,
+      "sampling/sampling_logp_difference/mean": 0.016575772315263748,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.00016343776496796636,
+      "clip_ratio/high_mean": 4.387032890917908e-05,
+      "clip_ratio/low_mean": 0.00010361431054661807,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00014748463922842348,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 11830.0,
+      "completions/max_terminated_length": 11830.0,
+      "completions/mean_length": 3988.203125,
+      "completions/mean_terminated_length": 3988.203125,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 0.5831322409212589,
+      "epoch": 0.029898804047838085,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023537185043096542,
+      "learning_rate": 1e-05,
+      "loss": 0.0137,
+      "num_tokens": 22443753.0,
+      "reward": 0.5625,
+      "reward_std": 0.33090677857398987,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999172687530518,
+      "sampling/importance_sampling_ratio/min": 0.017818376421928406,
+      "sampling/sampling_logp_difference/max": 4.027524948120117,
+      "sampling/sampling_logp_difference/mean": 0.01679972931742668,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 7.745890934529598e-05,
+      "clip_ratio/high_mean": 2.375019573719328e-05,
+      "clip_ratio/low_mean": 0.0002563541038398398,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002801043035560724,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15926.0,
+      "completions/mean_length": 6300.890625,
+      "completions/mean_terminated_length": 5804.99951171875,
+      "completions/min_length": 1197.0,
+      "completions/min_terminated_length": 1197.0,
+      "entropy": 0.45622409880161285,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014021627139300108,
+      "learning_rate": 1e-05,
+      "loss": 0.147,
+      "num_tokens": 22857290.0,
+      "reward": 0.5,
+      "reward_std": 0.378745436668396,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030517578125,
+      "sampling/importance_sampling_ratio/min": 0.30470171570777893,
+      "sampling/sampling_logp_difference/max": 1.5125246047973633,
+      "sampling/sampling_logp_difference/mean": 0.017332255840301514,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 9.111399231187534e-05,
+      "clip_ratio/high_mean": 2.768481340353901e-05,
+      "clip_ratio/low_mean": 0.00022677685137750814,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00025446166773690493,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12245.0,
+      "completions/mean_length": 5160.984375,
+      "completions/mean_terminated_length": 4609.03271484375,
+      "completions/min_length": 757.0,
+      "completions/min_terminated_length": 757.0,
+      "entropy": 0.41627733781933784,
+      "epoch": 0.030818767249310028,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0016577172791585326,
+      "learning_rate": 1e-05,
+      "loss": 0.0101,
+      "num_tokens": 23198369.0,
+      "reward": 0.6875,
+      "reward_std": 0.3729792833328247,
+      "rewards/accuracy_reward/mean": 0.6875,
+      "rewards/accuracy_reward/std": 0.467176616191864,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000255107879639,
+      "sampling/importance_sampling_ratio/min": 0.3549080193042755,
+      "sampling/sampling_logp_difference/max": 1.094315528869629,
+      "sampling/sampling_logp_difference/mean": 0.016087274998426437,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0001335845809080638,
+      "clip_ratio/high_mean": 4.6601401209045434e-05,
+      "clip_ratio/low_mean": 0.00029043503491266165,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003370364320289809,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14731.0,
+      "completions/mean_length": 4509.109375,
+      "completions/mean_terminated_length": 3717.4501953125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "entropy": 0.42583196237683296,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0033681185450404882,
+      "learning_rate": 1e-05,
+      "loss": -0.0017,
+      "num_tokens": 23495008.0,
+      "reward": 0.609375,
+      "reward_std": 0.49446311593055725,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998528361320496,
+      "sampling/importance_sampling_ratio/min": 0.00014199235010892153,
+      "sampling/sampling_logp_difference/max": 8.859737396240234,
+      "sampling/sampling_logp_difference/mean": 0.016656186431646347,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.00010585653399175499,
+      "clip_ratio/high_mean": 3.166284977851319e-05,
+      "clip_ratio/low_mean": 0.00032884415986700333,
+      "clip_ratio/low_min": 3.282563193351962e-05,
+      "clip_ratio/region_mean": 0.00036050701601197943,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14715.0,
+      "completions/mean_length": 6722.671875,
+      "completions/mean_terminated_length": 6411.01611328125,
+      "completions/min_length": 860.0,
+      "completions/min_terminated_length": 860.0,
+      "entropy": 0.5157046765089035,
+      "epoch": 0.03173873045078197,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00212118704803288,
+      "learning_rate": 1e-05,
+      "loss": -0.0105,
+      "num_tokens": 23934899.0,
+      "reward": 0.328125,
+      "reward_std": 0.2414703369140625,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000383853912354,
+      "sampling/importance_sampling_ratio/min": 0.15753091871738434,
+      "sampling/sampling_logp_difference/max": 1.8481335639953613,
+      "sampling/sampling_logp_difference/mean": 0.019955601543188095,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.00011434509951868677,
+      "clip_ratio/high_mean": 3.723538395661308e-05,
+      "clip_ratio/low_mean": 0.00033702207656460814,
+      "clip_ratio/low_min": 1.756851634127088e-05,
+      "clip_ratio/region_mean": 0.00037425746631924994,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15667.0,
+      "completions/mean_length": 8077.90625,
+      "completions/mean_terminated_length": 6539.74072265625,
+      "completions/min_length": 825.0,
+      "completions/min_terminated_length": 825.0,
+      "entropy": 0.45871395990252495,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.001251625595614314,
+      "learning_rate": 1e-05,
+      "loss": 0.0493,
+      "num_tokens": 24461341.0,
+      "reward": 0.453125,
+      "reward_std": 0.38664889335632324,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999666213989258,
+      "sampling/importance_sampling_ratio/min": 0.11547781527042389,
+      "sampling/sampling_logp_difference/max": 2.158676862716675,
+      "sampling/sampling_logp_difference/mean": 0.019339658319950104,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.00010712722541939002,
+      "clip_ratio/high_mean": 3.323841019664542e-05,
+      "clip_ratio/low_mean": 0.0001494285193075484,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018266692586621502,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16074.0,
+      "completions/mean_length": 5004.703125,
+      "completions/mean_terminated_length": 4445.0654296875,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 0.5422494150698185,
+      "epoch": 0.03265869365225391,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019212040351703763,
+      "learning_rate": 1e-05,
+      "loss": 0.0703,
+      "num_tokens": 24791282.0,
+      "reward": 0.578125,
+      "reward_std": 0.31983357667922974,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000091791152954,
+      "sampling/importance_sampling_ratio/min": 0.2572252154350281,
+      "sampling/sampling_logp_difference/max": 1.4805357456207275,
+      "sampling/sampling_logp_difference/mean": 0.016796359792351723,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.00014950770128052682,
+      "clip_ratio/high_mean": 4.267084386810893e-05,
+      "clip_ratio/low_mean": 7.438720058416948e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00011705804536177311,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16099.0,
+      "completions/mean_length": 5883.796875,
+      "completions/mean_terminated_length": 3939.31494140625,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.35789375379681587,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0005889610038138926,
+      "learning_rate": 1e-05,
+      "loss": -0.0215,
+      "num_tokens": 25176237.0,
+      "reward": 0.46875,
+      "reward_std": 0.23356688022613525,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999967813491821,
+      "sampling/importance_sampling_ratio/min": 0.08377405256032944,
+      "sampling/sampling_logp_difference/max": 2.4796319007873535,
+      "sampling/sampling_logp_difference/mean": 0.014260279014706612,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.00012918999300381984,
+      "clip_ratio/high_mean": 5.179685820166924e-05,
+      "clip_ratio/low_mean": 0.00011270135428276262,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00016449821669084486,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12097.0,
+      "completions/mean_length": 5086.890625,
+      "completions/mean_terminated_length": 4722.4677734375,
+      "completions/min_length": 630.0,
+      "completions/min_terminated_length": 630.0,
+      "entropy": 0.38051650673151016,
+      "epoch": 0.03357865685372585,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003188627539202571,
+      "learning_rate": 1e-05,
+      "loss": 0.0989,
+      "num_tokens": 25517502.0,
+      "reward": 0.828125,
+      "reward_std": 0.3571978807449341,
+      "rewards/accuracy_reward/mean": 0.828125,
+      "rewards/accuracy_reward/std": 0.38025420904159546,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 0.19293053448200226,
+      "sampling/sampling_logp_difference/max": 1.6454250812530518,
+      "sampling/sampling_logp_difference/mean": 0.013862463645637035,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 4.800585429620696e-05,
+      "clip_ratio/high_mean": 1.9420242779233376e-05,
+      "clip_ratio/low_mean": 6.698135666738381e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.640159967399086e-05,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16214.0,
+      "completions/mean_length": 7180.609375,
+      "completions/mean_terminated_length": 5865.83935546875,
+      "completions/min_length": 1171.0,
+      "completions/min_terminated_length": 1171.0,
+      "entropy": 0.47618816792964935,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0008011905592866242,
+      "learning_rate": 1e-05,
+      "loss": -0.0153,
+      "num_tokens": 25986021.0,
+      "reward": 0.546875,
+      "reward_std": 0.2519446909427643,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999934732913971,
+      "sampling/importance_sampling_ratio/min": 0.17329953610897064,
+      "sampling/sampling_logp_difference/max": 1.7527337074279785,
+      "sampling/sampling_logp_difference/mean": 0.017364704981446266,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0001653397707741533,
+      "clip_ratio/high_mean": 4.513738815603574e-05,
+      "clip_ratio/low_mean": 0.0003383910643606214,
+      "clip_ratio/low_min": 2.9063008696539328e-05,
+      "clip_ratio/region_mean": 0.0003835284496744862,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 6114.203125,
+      "completions/mean_terminated_length": 5429.55029296875,
+      "completions/min_length": 381.0,
+      "completions/min_terminated_length": 381.0,
+      "entropy": 0.4914289750158787,
+      "epoch": 0.03449862005519779,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0013586069690063596,
+      "learning_rate": 1e-05,
+      "loss": -0.0356,
+      "num_tokens": 26387378.0,
+      "reward": 0.453125,
+      "reward_std": 0.4050365090370178,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000791549682617,
+      "sampling/importance_sampling_ratio/min": 0.07695373892784119,
+      "sampling/sampling_logp_difference/max": 2.5645508766174316,
+      "sampling/sampling_logp_difference/mean": 0.019334372133016586,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 3.184090246577398e-05,
+      "clip_ratio/high_mean": 1.024358095946809e-05,
+      "clip_ratio/low_mean": 0.00011936229930142872,
+      "clip_ratio/low_min": 5.828592748002848e-06,
+      "clip_ratio/region_mean": 0.0001296058802608968,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14485.0,
+      "completions/mean_length": 7263.515625,
+      "completions/mean_terminated_length": 6655.48388671875,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.4553263336420059,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0006440540309995413,
+      "learning_rate": 1e-05,
+      "loss": 0.0767,
+      "num_tokens": 26861243.0,
+      "reward": 0.546875,
+      "reward_std": 0.2382849156856537,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980092048645,
+      "sampling/importance_sampling_ratio/min": 0.15337347984313965,
+      "sampling/sampling_logp_difference/max": 1.8748793601989746,
+      "sampling/sampling_logp_difference/mean": 0.016627371311187744,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 5.934812543273438e-05,
+      "clip_ratio/high_mean": 1.4837031358183594e-05,
+      "clip_ratio/low_mean": 0.00015511889660047018,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00016995592795865377,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13194.0,
+      "completions/max_terminated_length": 13194.0,
+      "completions/mean_length": 3835.0625,
+      "completions/mean_terminated_length": 3835.0625,
+      "completions/min_length": 528.0,
+      "completions/min_terminated_length": 528.0,
+      "entropy": 0.5679256543517113,
+      "epoch": 0.03541858325666973,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015393231296911836,
+      "learning_rate": 1e-05,
+      "loss": 0.0614,
+      "num_tokens": 27117047.0,
+      "reward": 0.515625,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999722242355347,
+      "sampling/importance_sampling_ratio/min": 0.1471611112356186,
+      "sampling/sampling_logp_difference/max": 1.9162273406982422,
+      "sampling/sampling_logp_difference/mean": 0.017565816640853882,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0001033300140989013,
+      "clip_ratio/high_mean": 3.157118726448971e-05,
+      "clip_ratio/low_mean": 0.00023221444325827179,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00026378563097750884,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14929.0,
+      "completions/max_terminated_length": 14929.0,
+      "completions/mean_length": 5001.0625,
+      "completions/mean_terminated_length": 5001.0625,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "entropy": 0.4684673063457012,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0018944795010611415,
+      "learning_rate": 1e-05,
+      "loss": 0.1526,
+      "num_tokens": 27445811.0,
+      "reward": 0.5,
+      "reward_std": 0.42081791162490845,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999194145202637,
+      "sampling/importance_sampling_ratio/min": 0.051326826214790344,
+      "sampling/sampling_logp_difference/max": 2.9695417881011963,
+      "sampling/sampling_logp_difference/mean": 0.017393115907907486,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.00025518189613649156,
+      "clip_ratio/high_mean": 7.311715717150946e-05,
+      "clip_ratio/low_mean": 0.0003523219229464303,
+      "clip_ratio/low_min": 4.194631037535146e-05,
+      "clip_ratio/region_mean": 0.0004254390933056129,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 11377.0,
+      "completions/max_terminated_length": 11377.0,
+      "completions/mean_length": 4863.6875,
+      "completions/mean_terminated_length": 4863.6875,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.48279719054698944,
+      "epoch": 0.03633854645814168,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0036398270167410374,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 27768311.0,
+      "reward": 0.5,
+      "reward_std": 0.41034358739852905,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999462962150574,
+      "sampling/importance_sampling_ratio/min": 0.1956295520067215,
+      "sampling/sampling_logp_difference/max": 1.6315324306488037,
+      "sampling/sampling_logp_difference/mean": 0.017851797863841057,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 8.58052426337963e-05,
+      "clip_ratio/high_mean": 2.783080799417803e-05,
+      "clip_ratio/low_mean": 0.00012623786369658774,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00015406867260026047,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13248.0,
+      "completions/mean_length": 4896.609375,
+      "completions/mean_terminated_length": 4526.04833984375,
+      "completions/min_length": 719.0,
+      "completions/min_terminated_length": 719.0,
+      "entropy": 0.4410700872540474,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013588728616014123,
+      "learning_rate": 1e-05,
+      "loss": 0.0289,
+      "num_tokens": 28090318.0,
+      "reward": 0.5625,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000934600830078,
+      "sampling/importance_sampling_ratio/min": 0.2560647428035736,
+      "sampling/sampling_logp_difference/max": 1.3827834129333496,
+      "sampling/sampling_logp_difference/mean": 0.01850186660885811,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.00012984564500584383,
+      "clip_ratio/high_mean": 4.1093299500971625e-05,
+      "clip_ratio/low_mean": 0.00019706484090420417,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00023815813938199426,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 5411.6875,
+      "completions/mean_terminated_length": 5237.52392578125,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 0.435161255300045,
+      "epoch": 0.037258509659613616,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002362700179219246,
+      "learning_rate": 1e-05,
+      "loss": -0.0162,
+      "num_tokens": 28446506.0,
+      "reward": 0.609375,
+      "reward_std": 0.41185659170150757,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999968409538269,
+      "sampling/importance_sampling_ratio/min": 0.11768288910388947,
+      "sampling/sampling_logp_difference/max": 2.1397616863250732,
+      "sampling/sampling_logp_difference/mean": 0.016388364136219025,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.00018365592040936463,
+      "clip_ratio/high_mean": 5.955360620646388e-05,
+      "clip_ratio/low_mean": 0.00016669651313350187,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00022625011933996575,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14620.0,
+      "completions/mean_length": 5077.125,
+      "completions/mean_terminated_length": 4521.048828125,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 0.390783354640007,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004699930548667908,
+      "learning_rate": 1e-05,
+      "loss": 0.1469,
+      "num_tokens": 28780506.0,
+      "reward": 0.703125,
+      "reward_std": 0.26621314883232117,
+      "rewards/accuracy_reward/mean": 0.703125,
+      "rewards/accuracy_reward/std": 0.4604927599430084,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000169277191162,
+      "sampling/importance_sampling_ratio/min": 0.260940283536911,
+      "sampling/sampling_logp_difference/max": 1.8336589336395264,
+      "sampling/sampling_logp_difference/mean": 0.014649204909801483,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.00018347952845942928,
+      "clip_ratio/high_mean": 6.701854022139742e-05,
+      "clip_ratio/low_mean": 0.0003997059175162576,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00046672445023432374,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13433.0,
+      "completions/mean_length": 3479.25,
+      "completions/mean_terminated_length": 3274.412841796875,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.46421004086732864,
+      "epoch": 0.038178472861085555,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004662287421524525,
+      "learning_rate": 1e-05,
+      "loss": -0.0959,
+      "num_tokens": 29014730.0,
+      "reward": 0.375,
+      "reward_std": 0.41610968112945557,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000164270401001,
+      "sampling/importance_sampling_ratio/min": 0.20871604979038239,
+      "sampling/sampling_logp_difference/max": 1.5667805671691895,
+      "sampling/sampling_logp_difference/mean": 0.018132932484149933,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.00015986033577064518,
+      "clip_ratio/high_mean": 5.5160472129500704e-05,
+      "clip_ratio/low_mean": 0.00017546498065712512,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00023062545551510993,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 10876.0,
+      "completions/max_terminated_length": 10876.0,
+      "completions/mean_length": 5015.3125,
+      "completions/mean_terminated_length": 5015.3125,
+      "completions/min_length": 844.0,
+      "completions/min_terminated_length": 844.0,
+      "entropy": 0.4448152147233486,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00349896471016109,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 29347366.0,
+      "reward": 0.59375,
+      "reward_std": 0.34929439425468445,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999954104423523,
+      "sampling/importance_sampling_ratio/min": 0.32092222571372986,
+      "sampling/sampling_logp_difference/max": 1.1365565061569214,
+      "sampling/sampling_logp_difference/mean": 0.017620427533984184,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0001525146399217192,
+      "clip_ratio/high_mean": 4.684553550760029e-05,
+      "clip_ratio/low_mean": 0.0003866927354465588,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0004335382654971909,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15504.0,
+      "completions/max_terminated_length": 15504.0,
+      "completions/mean_length": 4333.453125,
+      "completions/mean_terminated_length": 4333.453125,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "entropy": 0.46303874254226685,
+      "epoch": 0.0390984360625575,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019385392079129815,
+      "learning_rate": 1e-05,
+      "loss": -0.0056,
+      "num_tokens": 29637331.0,
+      "reward": 0.515625,
+      "reward_std": 0.31512534618377686,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999468326568604,
+      "sampling/importance_sampling_ratio/min": 0.10085343569517136,
+      "sampling/sampling_logp_difference/max": 2.2940869331359863,
+      "sampling/sampling_logp_difference/mean": 0.017312370240688324,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 9.848577337834286e-05,
+      "clip_ratio/high_mean": 2.7283510007691802e-05,
+      "clip_ratio/low_mean": 0.00015025084576336667,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00017753436122802668,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 11260.0,
+      "completions/mean_length": 4033.9375,
+      "completions/mean_terminated_length": 3837.905029296875,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 0.41759752854704857,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0066805374808609486,
+      "learning_rate": 1e-05,
+      "loss": 0.1099,
+      "num_tokens": 29906119.0,
+      "reward": 0.671875,
+      "reward_std": 0.29355230927467346,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000640153884888,
+      "sampling/importance_sampling_ratio/min": 0.2963661849498749,
+      "sampling/sampling_logp_difference/max": 1.216159462928772,
+      "sampling/sampling_logp_difference/mean": 0.014013087376952171,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.00014968111190682976,
+      "clip_ratio/high_mean": 6.019531133460987e-05,
+      "clip_ratio/low_mean": 0.0001971712508748169,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00025736656061781105,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12530.0,
+      "completions/max_terminated_length": 12530.0,
+      "completions/mean_length": 4039.625,
+      "completions/mean_terminated_length": 4039.625,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "entropy": 0.4908015578985214,
+      "epoch": 0.04001839926402944,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0031810218933969736,
+      "learning_rate": 1e-05,
+      "loss": -0.0429,
+      "num_tokens": 30175247.0,
+      "reward": 0.6875,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.6875,
+      "rewards/accuracy_reward/std": 0.467176616191864,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998473525047302,
+      "sampling/importance_sampling_ratio/min": 0.12201575934886932,
+      "sampling/sampling_logp_difference/max": 2.103605031967163,
+      "sampling/sampling_logp_difference/mean": 0.014932084828615189,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0001242801772605162,
+      "clip_ratio/high_mean": 3.107004431512905e-05,
+      "clip_ratio/low_mean": 0.00024339640594917,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00027446644844530965,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14343.0,
+      "completions/max_terminated_length": 14343.0,
+      "completions/mean_length": 3642.65625,
+      "completions/mean_terminated_length": 3642.65625,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "entropy": 0.497805830091238,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.007662316784262657,
+      "learning_rate": 1e-05,
+      "loss": 0.0203,
+      "num_tokens": 30417265.0,
+      "reward": 0.609375,
+      "reward_std": 0.26196980476379395,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997639060020447,
+      "sampling/importance_sampling_ratio/min": 0.11714459955692291,
+      "sampling/sampling_logp_difference/max": 2.144346237182617,
+      "sampling/sampling_logp_difference/mean": 0.018438715487718582,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.00012720484210149152,
+      "clip_ratio/high_mean": 5.5490041631856e-05,
+      "clip_ratio/low_mean": 0.0003174601497448748,
+      "clip_ratio/low_min": 1.4323364666779526e-05,
+      "clip_ratio/region_mean": 0.0003729501986526884,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16234.0,
+      "completions/mean_length": 7055.640625,
+      "completions/mean_terminated_length": 6596.86865234375,
+      "completions/min_length": 1407.0,
+      "completions/min_terminated_length": 1407.0,
+      "entropy": 0.4791577495634556,
+      "epoch": 0.04093836246550138,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0010236409725621343,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 30879706.0,
+      "reward": 0.5,
+      "reward_std": 0.3230288028717041,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.1733396202325821,
+      "sampling/sampling_logp_difference/max": 1.75250244140625,
+      "sampling/sampling_logp_difference/mean": 0.017151571810245514,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.00010716462838900043,
+      "clip_ratio/high_mean": 3.53956390881649e-05,
+      "clip_ratio/low_mean": 0.0002695056762149761,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00030490130939142546,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16087.0,
+      "completions/mean_length": 6766.734375,
+      "completions/mean_terminated_length": 6125.58349609375,
+      "completions/min_length": 652.0,
+      "completions/min_terminated_length": 652.0,
+      "entropy": 0.5393588915467262,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0016526335384696722,
+      "learning_rate": 1e-05,
+      "loss": 0.0128,
+      "num_tokens": 31324809.0,
+      "reward": 0.375,
+      "reward_std": 0.342454731464386,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000569820404053,
+      "sampling/importance_sampling_ratio/min": 0.009934165515005589,
+      "sampling/sampling_logp_difference/max": 4.6117753982543945,
+      "sampling/sampling_logp_difference/mean": 0.018986130133271217,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 7.089101882229443e-05,
+      "clip_ratio/high_mean": 2.2431363390751358e-05,
+      "clip_ratio/low_mean": 0.00013420935329122585,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00015664071861465345,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15215.0,
+      "completions/mean_length": 6582.734375,
+      "completions/mean_terminated_length": 5379.0703125,
+      "completions/min_length": 750.0,
+      "completions/min_terminated_length": 750.0,
+      "entropy": 0.5493632070720196,
+      "epoch": 0.04185832566697332,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0012266625417396426,
+      "learning_rate": 1e-05,
+      "loss": -0.0121,
+      "num_tokens": 31754640.0,
+      "reward": 0.515625,
+      "reward_std": 0.3266732692718506,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999552965164185,
+      "sampling/importance_sampling_ratio/min": 0.026827236637473106,
+      "sampling/sampling_logp_difference/max": 3.618337631225586,
+      "sampling/sampling_logp_difference/mean": 0.01922820881009102,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.00015220932891679695,
+      "clip_ratio/high_mean": 5.230503052189306e-05,
+      "clip_ratio/low_mean": 0.00028057711733708857,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003328821453578712,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14355.0,
+      "completions/max_terminated_length": 14355.0,
+      "completions/mean_length": 4673.609375,
+      "completions/mean_terminated_length": 4673.609375,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "entropy": 0.37891076132655144,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0038514919579029083,
+      "learning_rate": 1e-05,
+      "loss": -0.0397,
+      "num_tokens": 32063375.0,
+      "reward": 0.671875,
+      "reward_std": 0.3908922076225281,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999393224716187,
+      "sampling/importance_sampling_ratio/min": 0.20188941061496735,
+      "sampling/sampling_logp_difference/max": 1.6000351905822754,
+      "sampling/sampling_logp_difference/mean": 0.014766812324523926,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.00011865788155773771,
+      "clip_ratio/high_mean": 4.490372168675094e-05,
+      "clip_ratio/low_mean": 0.00023933520606078673,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00028423893309081905,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 10753.0,
+      "completions/max_terminated_length": 10753.0,
+      "completions/mean_length": 4133.34375,
+      "completions/mean_terminated_length": 4133.34375,
+      "completions/min_length": 492.0,
+      "completions/min_terminated_length": 492.0,
+      "entropy": 0.5132806189358234,
+      "epoch": 0.042778288868445265,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014511283952742815,
+      "learning_rate": 1e-05,
+      "loss": 0.016,
+      "num_tokens": 32347861.0,
+      "reward": 0.40625,
+      "reward_std": 0.3061639666557312,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999454021453857,
+      "sampling/importance_sampling_ratio/min": 0.2521326541900635,
+      "sampling/sampling_logp_difference/max": 1.3777999877929688,
+      "sampling/sampling_logp_difference/mean": 0.017015758901834488,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.00018065326321448083,
+      "clip_ratio/high_mean": 6.95563029466939e-05,
+      "clip_ratio/low_mean": 0.00028433307852537837,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003538893797667697,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15960.0,
+      "completions/mean_length": 6026.265625,
+      "completions/mean_terminated_length": 5516.86865234375,
+      "completions/min_length": 873.0,
+      "completions/min_terminated_length": 873.0,
+      "entropy": 0.42576640471816063,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002165265381336212,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 32742870.0,
+      "reward": 0.609375,
+      "reward_std": 0.4523906111717224,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000587701797485,
+      "sampling/importance_sampling_ratio/min": 0.011113330721855164,
+      "sampling/sampling_logp_difference/max": 4.49960994720459,
+      "sampling/sampling_logp_difference/mean": 0.01895231008529663,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 7.521962379541947e-05,
+      "clip_ratio/high_mean": 2.5767211354832398e-05,
+      "clip_ratio/low_mean": 0.0003009975771419704,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00032676478986104485,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15803.0,
+      "completions/mean_length": 6262.4375,
+      "completions/mean_terminated_length": 5019.4384765625,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 0.3823938798159361,
+      "epoch": 0.043698252069917204,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017652231035754085,
+      "learning_rate": 1e-05,
+      "loss": -0.0391,
+      "num_tokens": 33152578.0,
+      "reward": 0.640625,
+      "reward_std": 0.31512534618377686,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000849962234497,
+      "sampling/importance_sampling_ratio/min": 0.08265355229377747,
+      "sampling/sampling_logp_difference/max": 2.4930975437164307,
+      "sampling/sampling_logp_difference/mean": 0.015054848976433277,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.00019391980549698928,
+      "clip_ratio/high_mean": 5.29239216575661e-05,
+      "clip_ratio/low_mean": 0.00014883351195749128,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002017574342971784,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13824.0,
+      "completions/max_terminated_length": 13824.0,
+      "completions/mean_length": 3582.4375,
+      "completions/mean_terminated_length": 3582.4375,
+      "completions/min_length": 787.0,
+      "completions/min_terminated_length": 787.0,
+      "entropy": 0.42068246752023697,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001477578654885292,
+      "learning_rate": 1e-05,
+      "loss": 0.1755,
+      "num_tokens": 33390510.0,
+      "reward": 0.796875,
+      "reward_std": 0.3403330445289612,
+      "rewards/accuracy_reward/mean": 0.796875,
+      "rewards/accuracy_reward/std": 0.40550529956817627,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000420808792114,
+      "sampling/importance_sampling_ratio/min": 0.20339767634868622,
+      "sampling/sampling_logp_difference/max": 1.5925922393798828,
+      "sampling/sampling_logp_difference/mean": 0.013980602845549583,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0001818037626435398,
+      "clip_ratio/high_mean": 5.415482519310899e-05,
+      "clip_ratio/low_mean": 0.00012345622963039204,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00017761105391400633,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15258.0,
+      "completions/mean_length": 4848.875,
+      "completions/mean_terminated_length": 4476.77392578125,
+      "completions/min_length": 951.0,
+      "completions/min_terminated_length": 951.0,
+      "entropy": 0.3348248451948166,
+      "epoch": 0.04461821527138914,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017093514325097203,
+      "learning_rate": 1e-05,
+      "loss": 0.119,
+      "num_tokens": 33711878.0,
+      "reward": 0.703125,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.703125,
+      "rewards/accuracy_reward/std": 0.4604927599430084,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999937117099762,
+      "sampling/importance_sampling_ratio/min": 0.08507421612739563,
+      "sampling/sampling_logp_difference/max": 2.464231252670288,
+      "sampling/sampling_logp_difference/mean": 0.013996141962707043,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 8.166239786078222e-05,
+      "clip_ratio/high_mean": 3.0598509965784615e-05,
+      "clip_ratio/low_mean": 0.0001227793386533449,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00015337784543589805,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12150.0,
+      "completions/max_terminated_length": 12150.0,
+      "completions/mean_length": 3608.53125,
+      "completions/mean_terminated_length": 3608.53125,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.4186965227127075,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035578685346990824,
+      "learning_rate": 1e-05,
+      "loss": -0.02,
+      "num_tokens": 33952336.0,
+      "reward": 0.734375,
+      "reward_std": 0.41398805379867554,
+      "rewards/accuracy_reward/mean": 0.734375,
+      "rewards/accuracy_reward/std": 0.44515693187713623,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000457763671875,
+      "sampling/importance_sampling_ratio/min": 0.21771050989627838,
+      "sampling/sampling_logp_difference/max": 1.8422369956970215,
+      "sampling/sampling_logp_difference/mean": 0.015421013347804546,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0002565569302532822,
+      "clip_ratio/high_mean": 7.735242525086505e-05,
+      "clip_ratio/low_mean": 0.00022900168551132083,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003063541153096594,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16100.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 5686.828125,
+      "completions/mean_terminated_length": 5686.828125,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 0.3955523520708084,
+      "epoch": 0.04553817847286108,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0015768167795613408,
+      "learning_rate": 1e-05,
+      "loss": -0.0626,
+      "num_tokens": 34325829.0,
+      "reward": 0.5,
+      "reward_std": 0.3682710528373718,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000638961791992,
+      "sampling/importance_sampling_ratio/min": 0.1319275200366974,
+      "sampling/sampling_logp_difference/max": 2.0255026817321777,
+      "sampling/sampling_logp_difference/mean": 0.01693328656256199,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.00017156526701000985,
+      "clip_ratio/high_mean": 4.4765379698219476e-05,
+      "clip_ratio/low_mean": 0.00013393372819336946,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00017869910379886278,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13278.0,
+      "completions/max_terminated_length": 13278.0,
+      "completions/mean_length": 4955.796875,
+      "completions/mean_terminated_length": 4955.796875,
+      "completions/min_length": 410.0,
+      "completions/min_terminated_length": 410.0,
+      "entropy": 0.41905970498919487,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0016299523413181305,
+      "learning_rate": 1e-05,
+      "loss": -0.0648,
+      "num_tokens": 34650848.0,
+      "reward": 0.578125,
+      "reward_std": 0.35612428188323975,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999072551727295,
+      "sampling/importance_sampling_ratio/min": 0.27319714426994324,
+      "sampling/sampling_logp_difference/max": 1.2975616455078125,
+      "sampling/sampling_logp_difference/mean": 0.016213715076446533,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.000201842942260555,
+      "clip_ratio/high_mean": 8.582275131630013e-05,
+      "clip_ratio/low_mean": 4.658012494473951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00013240287626103964,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 7215.21875,
+      "completions/mean_terminated_length": 7069.68310546875,
+      "completions/min_length": 1205.0,
+      "completions/min_terminated_length": 1205.0,
+      "entropy": 0.41243599355220795,
+      "epoch": 0.04645814167433303,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003915950655937195,
+      "learning_rate": 1e-05,
+      "loss": 0.0245,
+      "num_tokens": 35123918.0,
+      "reward": 0.65625,
+      "reward_std": 0.2346404492855072,
+      "rewards/accuracy_reward/mean": 0.65625,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000272989273071,
+      "sampling/importance_sampling_ratio/min": 6.35706098872646e-10,
+      "sampling/sampling_logp_difference/max": 21.176284790039062,
+      "sampling/sampling_logp_difference/mean": 0.01628049463033676,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.00018997467668668833,
+      "clip_ratio/high_mean": 7.346466600210988e-05,
+      "clip_ratio/low_mean": 0.00024571850167376397,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00031918316017254256,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14416.0,
+      "completions/mean_length": 4967.4375,
+      "completions/mean_terminated_length": 4599.1611328125,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "entropy": 0.43091630935668945,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0014639816945418715,
+      "learning_rate": 1e-05,
+      "loss": 0.0304,
+      "num_tokens": 35454658.0,
+      "reward": 0.5,
+      "reward_std": 0.3777071237564087,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000555515289307,
+      "sampling/importance_sampling_ratio/min": 0.2935287654399872,
+      "sampling/sampling_logp_difference/max": 1.4500041007995605,
+      "sampling/sampling_logp_difference/mean": 0.016430124640464783,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.00027447581123851705,
+      "clip_ratio/high_mean": 7.783462342558778e-05,
+      "clip_ratio/low_mean": 0.00025762664154171944,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00033546126724104397,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14372.0,
+      "completions/max_terminated_length": 14372.0,
+      "completions/mean_length": 4758.21875,
+      "completions/mean_terminated_length": 4758.21875,
+      "completions/min_length": 1026.0,
+      "completions/min_terminated_length": 1026.0,
+      "entropy": 0.5072713866829872,
+      "epoch": 0.04737810487580497,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0027890228666365147,
+      "learning_rate": 1e-05,
+      "loss": -0.002,
+      "num_tokens": 35771336.0,
+      "reward": 0.53125,
+      "reward_std": 0.3029785752296448,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000927448272705,
+      "sampling/importance_sampling_ratio/min": 0.13961824774742126,
+      "sampling/sampling_logp_difference/max": 1.9688433408737183,
+      "sampling/sampling_logp_difference/mean": 0.017496878281235695,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.00013204321567172883,
+      "clip_ratio/high_mean": 3.5426355907475227e-05,
+      "clip_ratio/low_mean": 0.00023678694105910836,
+      "clip_ratio/low_min": 3.282993930042721e-05,
+      "clip_ratio/region_mean": 0.0002722132940107258,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13356.0,
+      "completions/max_terminated_length": 13356.0,
+      "completions/mean_length": 4473.171875,
+      "completions/mean_terminated_length": 4473.171875,
+      "completions/min_length": 681.0,
+      "completions/min_terminated_length": 681.0,
+      "entropy": 0.5951492674648762,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004909890703856945,
+      "learning_rate": 1e-05,
+      "loss": 0.0509,
+      "num_tokens": 36068339.0,
+      "reward": 0.515625,
+      "reward_std": 0.3492845892906189,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999641180038452,
+      "sampling/importance_sampling_ratio/min": 0.18714448809623718,
+      "sampling/sampling_logp_difference/max": 1.6758742332458496,
+      "sampling/sampling_logp_difference/mean": 0.01959427446126938,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.00016260610209428705,
+      "clip_ratio/high_mean": 5.445963370220852e-05,
+      "clip_ratio/low_mean": 0.00027578835397434887,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00033024798904079944,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15313.0,
+      "completions/mean_length": 5181.5625,
+      "completions/mean_terminated_length": 4630.62255859375,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "entropy": 0.47477902472019196,
+      "epoch": 0.04829806807727691,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0025038071908056736,
+      "learning_rate": 1e-05,
+      "loss": 0.0722,
+      "num_tokens": 36409575.0,
+      "reward": 0.453125,
+      "reward_std": 0.36507582664489746,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000419616699219,
+      "sampling/importance_sampling_ratio/min": 0.01011443231254816,
+      "sampling/sampling_logp_difference/max": 4.593791961669922,
+      "sampling/sampling_logp_difference/mean": 0.017458593472838402,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 8.053758392634336e-05,
+      "clip_ratio/high_mean": 3.110795205429895e-05,
+      "clip_ratio/low_mean": 0.0005240299615252297,
+      "clip_ratio/low_min": 6.53458118904382e-06,
+      "clip_ratio/region_mean": 0.0005551379072130658,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16081.0,
+      "completions/mean_length": 8749.125,
+      "completions/mean_terminated_length": 7335.25927734375,
+      "completions/min_length": 997.0,
+      "completions/min_terminated_length": 997.0,
+      "entropy": 0.586535070091486,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003648218931630254,
+      "learning_rate": 1e-05,
+      "loss": 0.0214,
+      "num_tokens": 36979055.0,
+      "reward": 0.421875,
+      "reward_std": 0.2993341088294983,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999933242797852,
+      "sampling/importance_sampling_ratio/min": 0.0010058816988021135,
+      "sampling/sampling_logp_difference/max": 6.901890754699707,
+      "sampling/sampling_logp_difference/mean": 0.023093216121196747,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.00013844405839336105,
+      "clip_ratio/high_mean": 4.615002399077639e-05,
+      "clip_ratio/low_mean": 0.0001350231077594799,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018117312947651953,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15589.0,
+      "completions/mean_length": 6056.921875,
+      "completions/mean_terminated_length": 5368.4501953125,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "entropy": 0.4401419050991535,
+      "epoch": 0.04921803127874885,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0014414339093491435,
+      "learning_rate": 1e-05,
+      "loss": -0.0276,
+      "num_tokens": 37376482.0,
+      "reward": 0.34375,
+      "reward_std": 0.3072218894958496,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000091314315796,
+      "sampling/importance_sampling_ratio/min": 0.14452841877937317,
+      "sampling/sampling_logp_difference/max": 1.934279203414917,
+      "sampling/sampling_logp_difference/mean": 0.017904866486787796,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0001955404550244566,
+      "clip_ratio/high_mean": 7.653925149497809e-05,
+      "clip_ratio/low_mean": 0.0002893621494877152,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003659013982542092,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13655.0,
+      "completions/mean_length": 4545.0,
+      "completions/mean_terminated_length": 3541.69482421875,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 0.372543640434742,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0034049772657454014,
+      "learning_rate": 1e-05,
+      "loss": 0.1294,
+      "num_tokens": 37675834.0,
+      "reward": 0.6875,
+      "reward_std": 0.39347875118255615,
+      "rewards/accuracy_reward/mean": 0.6875,
+      "rewards/accuracy_reward/std": 0.467176616191864,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000131130218506,
+      "sampling/importance_sampling_ratio/min": 3.0434759537456557e-05,
+      "sampling/sampling_logp_difference/max": 10.399925231933594,
+      "sampling/sampling_logp_difference/mean": 0.014691833406686783,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.00012398830403981265,
+      "clip_ratio/high_mean": 3.488012771413196e-05,
+      "clip_ratio/low_mean": 0.00017011856152748805,
+      "clip_ratio/low_min": 7.710813406447414e-06,
+      "clip_ratio/region_mean": 0.00020499869060586207,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16184.0,
+      "completions/mean_length": 5124.203125,
+      "completions/mean_terminated_length": 4945.4765625,
+      "completions/min_length": 693.0,
+      "completions/min_terminated_length": 693.0,
+      "entropy": 0.40799567475914955,
+      "epoch": 0.05013799448022079,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021821665577590466,
+      "learning_rate": 1e-05,
+      "loss": 0.0248,
+      "num_tokens": 38013495.0,
+      "reward": 0.515625,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999369978904724,
+      "sampling/importance_sampling_ratio/min": 0.04143543541431427,
+      "sampling/sampling_logp_difference/max": 3.1836187839508057,
+      "sampling/sampling_logp_difference/mean": 0.015723641961812973,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.00011402473955968162,
+      "clip_ratio/high_mean": 2.8506184889920405e-05,
+      "clip_ratio/low_mean": 0.00014105440459388774,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00016956058880168712,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 10540.0,
+      "completions/mean_length": 3167.21875,
+      "completions/mean_terminated_length": 2957.4287109375,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.4380917586386204,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.003855443326756358,
+      "learning_rate": 1e-05,
+      "loss": 0.1642,
+      "num_tokens": 38225933.0,
+      "reward": 0.640625,
+      "reward_std": 0.48551157116889954,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000114440917969,
+      "sampling/importance_sampling_ratio/min": 0.07347333431243896,
+      "sampling/sampling_logp_difference/max": 2.610832691192627,
+      "sampling/sampling_logp_difference/mean": 0.014714892953634262,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.00015438527952937875,
+      "clip_ratio/high_mean": 4.432886225913535e-05,
+      "clip_ratio/low_mean": 0.0001313946268055588,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001757234877004521,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15641.0,
+      "completions/max_terminated_length": 15641.0,
+      "completions/mean_length": 3063.59375,
+      "completions/mean_terminated_length": 3063.59375,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 0.3451516814529896,
+      "epoch": 0.05105795768169273,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0017416629707440734,
+      "learning_rate": 1e-05,
+      "loss": 0.0888,
+      "num_tokens": 38429819.0,
+      "reward": 0.5625,
+      "reward_std": 0.4139782190322876,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999927282333374,
+      "sampling/importance_sampling_ratio/min": 0.13355965912342072,
+      "sampling/sampling_logp_difference/max": 2.013206958770752,
+      "sampling/sampling_logp_difference/mean": 0.01309503149241209,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.00011567322871997021,
+      "clip_ratio/high_mean": 4.967931909050094e-05,
+      "clip_ratio/low_mean": 0.00022050612915336387,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002701854518818436,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16212.0,
+      "completions/mean_length": 8586.265625,
+      "completions/mean_terminated_length": 7784.27587890625,
+      "completions/min_length": 938.0,
+      "completions/min_terminated_length": 938.0,
+      "entropy": 0.4284644089639187,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0018239191267639399,
+      "learning_rate": 1e-05,
+      "loss": 0.1554,
+      "num_tokens": 38995164.0,
+      "reward": 0.5625,
+      "reward_std": 0.3924052119255066,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999877393245697,
+      "sampling/importance_sampling_ratio/min": 0.008349776268005371,
+      "sampling/sampling_logp_difference/max": 4.785520553588867,
+      "sampling/sampling_logp_difference/mean": 0.018177181482315063,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.00010434241585244308,
+      "clip_ratio/high_mean": 3.0345908612616768e-05,
+      "clip_ratio/low_mean": 0.0002064375662484963,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.000236783477703284,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16191.0,
+      "completions/mean_length": 5793.140625,
+      "completions/mean_terminated_length": 5625.0322265625,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.4497801251709461,
+      "epoch": 0.05197792088316467,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024066369514912367,
+      "learning_rate": 1e-05,
+      "loss": 0.0793,
+      "num_tokens": 39376541.0,
+      "reward": 0.515625,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000128746032715,
+      "sampling/importance_sampling_ratio/min": 0.2231399565935135,
+      "sampling/sampling_logp_difference/max": 1.4999561309814453,
+      "sampling/sampling_logp_difference/mean": 0.01722925715148449,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.000142209177283803,
+      "clip_ratio/high_mean": 3.741042246474535e-05,
+      "clip_ratio/low_mean": 8.439288603767636e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00012180330850242171,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 6118.15625,
+      "completions/mean_terminated_length": 5955.20654296875,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "entropy": 0.4755205847322941,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018325611017644405,
+      "learning_rate": 1e-05,
+      "loss": -0.0845,
+      "num_tokens": 39776703.0,
+      "reward": 0.46875,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999382495880127,
+      "sampling/importance_sampling_ratio/min": 0.11108703166246414,
+      "sampling/sampling_logp_difference/max": 2.197441339492798,
+      "sampling/sampling_logp_difference/mean": 0.016467180103063583,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.00013539696647058008,
+      "clip_ratio/high_mean": 4.4776959612136125e-05,
+      "clip_ratio/low_mean": 0.0001621112608063413,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00020688822041847743,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13185.0,
+      "completions/max_terminated_length": 13185.0,
+      "completions/mean_length": 4067.65625,
+      "completions/mean_terminated_length": 4067.65625,
+      "completions/min_length": 665.0,
+      "completions/min_terminated_length": 665.0,
+      "entropy": 0.4728252850472927,
+      "epoch": 0.052897884084636616,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0028504086658358574,
+      "learning_rate": 1e-05,
+      "loss": -0.0113,
+      "num_tokens": 40046041.0,
+      "reward": 0.578125,
+      "reward_std": 0.4050365090370178,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000386238098145,
+      "sampling/importance_sampling_ratio/min": 0.2996428310871124,
+      "sampling/sampling_logp_difference/max": 1.2051640748977661,
+      "sampling/sampling_logp_difference/mean": 0.01653115823864937,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.00016086296363937436,
+      "clip_ratio/high_mean": 5.132767250870529e-05,
+      "clip_ratio/low_mean": 8.466833241982386e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00013599600742963958,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 9704.0,
+      "completions/max_terminated_length": 9704.0,
+      "completions/mean_length": 3261.0,
+      "completions/mean_terminated_length": 3261.0,
+      "completions/min_length": 745.0,
+      "completions/min_terminated_length": 745.0,
+      "entropy": 0.5103091672062874,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004390237387269735,
+      "learning_rate": 1e-05,
+      "loss": -0.0547,
+      "num_tokens": 40263945.0,
+      "reward": 0.6875,
+      "reward_std": 0.34034284949302673,
+      "rewards/accuracy_reward/mean": 0.6875,
+      "rewards/accuracy_reward/std": 0.467176616191864,
+      "sampling/importance_sampling_ratio/max": 1.7932543754577637,
+      "sampling/importance_sampling_ratio/mean": 0.9998769760131836,
+      "sampling/importance_sampling_ratio/min": 0.0008851143647916615,
+      "sampling/sampling_logp_difference/max": 7.029793739318848,
+      "sampling/sampling_logp_difference/mean": 0.017080796882510185,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 6.713680249958998e-05,
+      "clip_ratio/high_mean": 1.6784200624897494e-05,
+      "clip_ratio/low_mean": 0.00023034057926452078,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00024712477647881315,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13991.0,
+      "completions/mean_length": 5899.171875,
+      "completions/mean_terminated_length": 5200.18359375,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "entropy": 0.5374041832983494,
+      "epoch": 0.053817847286108556,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0015930214431136847,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 40650964.0,
+      "reward": 0.578125,
+      "reward_std": 0.1530819982290268,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997994303703308,
+      "sampling/importance_sampling_ratio/min": 0.25812914967536926,
+      "sampling/sampling_logp_difference/max": 1.354295253753662,
+      "sampling/sampling_logp_difference/mean": 0.020320815965533257,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.00013986208477945183,
+      "clip_ratio/high_mean": 4.5305262233341637e-05,
+      "clip_ratio/low_mean": 0.00014710804316564463,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00019241330755903618,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12004.0,
+      "completions/max_terminated_length": 12004.0,
+      "completions/mean_length": 3367.328125,
+      "completions/mean_terminated_length": 3367.328125,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.4309644438326359,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029529735911637545,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 40874649.0,
+      "reward": 0.65625,
+      "reward_std": 0.4050266742706299,
+      "rewards/accuracy_reward/mean": 0.65625,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999914824962616,
+      "sampling/importance_sampling_ratio/min": 0.5185436010360718,
+      "sampling/sampling_logp_difference/max": 0.8470115661621094,
+      "sampling/sampling_logp_difference/mean": 0.014474974945187569,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.00012979303846805124,
+      "clip_ratio/high_mean": 4.9982098062173463e-05,
+      "clip_ratio/low_mean": 0.00030595043153880397,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003559325195965357,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15999.0,
+      "completions/mean_length": 6069.640625,
+      "completions/mean_terminated_length": 5905.9208984375,
+      "completions/min_length": 1211.0,
+      "completions/min_terminated_length": 1211.0,
+      "entropy": 0.569359052926302,
+      "epoch": 0.054737810487580495,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.002493771258741617,
+      "learning_rate": 1e-05,
+      "loss": -0.0114,
+      "num_tokens": 41272114.0,
+      "reward": 0.46875,
+      "reward_std": 0.4082317352294922,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000840425491333,
+      "sampling/importance_sampling_ratio/min": 0.00010690245835576206,
+      "sampling/sampling_logp_difference/max": 9.143593788146973,
+      "sampling/sampling_logp_difference/mean": 0.019345756620168686,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0002575382823124528,
+      "clip_ratio/high_mean": 8.639247698738473e-05,
+      "clip_ratio/low_mean": 0.00022337435802910477,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003097668359259842,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14922.0,
+      "completions/mean_length": 4214.5625,
+      "completions/mean_terminated_length": 3616.0654296875,
+      "completions/min_length": 815.0,
+      "completions/min_terminated_length": 815.0,
+      "entropy": 0.37961139529943466,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002614647848531604,
+      "learning_rate": 1e-05,
+      "loss": 0.1154,
+      "num_tokens": 41550902.0,
+      "reward": 0.765625,
+      "reward_std": 0.31512534618377686,
+      "rewards/accuracy_reward/mean": 0.765625,
+      "rewards/accuracy_reward/std": 0.42695629596710205,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.27158376574516296,
+      "sampling/sampling_logp_difference/max": 1.3034846782684326,
+      "sampling/sampling_logp_difference/mean": 0.014523299410939217,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.00022899942905496573,
+      "clip_ratio/high_mean": 7.227375863294583e-05,
+      "clip_ratio/low_mean": 0.0001765698939379945,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00024884364665922476,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 10759.0,
+      "completions/mean_length": 3585.65625,
+      "completions/mean_terminated_length": 3382.508056640625,
+      "completions/min_length": 578.0,
+      "completions/min_terminated_length": 578.0,
+      "entropy": 0.3841286860406399,
+      "epoch": 0.05565777368905244,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002633139258250594,
+      "learning_rate": 1e-05,
+      "loss": 0.0646,
+      "num_tokens": 41788800.0,
+      "reward": 0.5,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000247955322266,
+      "sampling/importance_sampling_ratio/min": 0.03442913666367531,
+      "sampling/sampling_logp_difference/max": 3.368852138519287,
+      "sampling/sampling_logp_difference/mean": 0.014772026799619198,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.00011599275330809178,
+      "clip_ratio/high_mean": 4.372763510218647e-05,
+      "clip_ratio/low_mean": 0.00016362589440177544,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00020735352973133558,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 6827.625,
+      "completions/mean_terminated_length": 5462.4287109375,
+      "completions/min_length": 1178.0,
+      "completions/min_terminated_length": 1178.0,
+      "entropy": 0.4236124977469444,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022398591972887516,
+      "learning_rate": 1e-05,
+      "loss": 0.0504,
+      "num_tokens": 42236600.0,
+      "reward": 0.625,
+      "reward_std": 0.2756394147872925,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999827146530151,
+      "sampling/importance_sampling_ratio/min": 0.0008994160452857614,
+      "sampling/sampling_logp_difference/max": 7.01376485824585,
+      "sampling/sampling_logp_difference/mean": 0.015439807437360287,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 5.419486842583865e-05,
+      "clip_ratio/high_mean": 2.424228341624257e-05,
+      "clip_ratio/low_mean": 0.0003505960376060102,
+      "clip_ratio/low_min": 6.290438614087179e-05,
+      "clip_ratio/region_mean": 0.00037483832056750543,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15703.0,
+      "completions/mean_length": 7439.390625,
+      "completions/mean_terminated_length": 6161.58935546875,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.42251385003328323,
+      "epoch": 0.05657773689052438,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013182272668927908,
+      "learning_rate": 1e-05,
+      "loss": 0.0661,
+      "num_tokens": 42722049.0,
+      "reward": 0.53125,
+      "reward_std": 0.28247910737991333,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000464916229248,
+      "sampling/importance_sampling_ratio/min": 0.009381524287164211,
+      "sampling/sampling_logp_difference/max": 4.669013023376465,
+      "sampling/sampling_logp_difference/mean": 0.017970317974686623,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.00022104287472757278,
+      "clip_ratio/high_mean": 8.386546346628165e-05,
+      "clip_ratio/low_mean": 0.00033902134964591824,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0004228868028803845,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12644.0,
+      "completions/mean_length": 4582.703125,
+      "completions/mean_terminated_length": 4395.38134765625,
+      "completions/min_length": 710.0,
+      "completions/min_terminated_length": 710.0,
+      "entropy": 0.45571400970220566,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.003415483282878995,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 43024382.0,
+      "reward": 0.625,
+      "reward_std": 0.45134252309799194,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999266862869263,
+      "sampling/importance_sampling_ratio/min": 0.0020838617347180843,
+      "sampling/sampling_logp_difference/max": 6.173532485961914,
+      "sampling/sampling_logp_difference/mean": 0.017238060012459755,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0001726137379591819,
+      "clip_ratio/high_mean": 5.8308734878664836e-05,
+      "clip_ratio/low_mean": 0.0001304974630329525,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018880619791161735,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12312.0,
+      "completions/mean_length": 3796.65625,
+      "completions/mean_terminated_length": 3177.6064453125,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.36019331216812134,
+      "epoch": 0.05749770009199632,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004071601200848818,
+      "learning_rate": 1e-05,
+      "loss": -0.0787,
+      "num_tokens": 43277528.0,
+      "reward": 0.625,
+      "reward_std": 0.3787454068660736,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999937891960144,
+      "sampling/importance_sampling_ratio/min": 0.3052186071872711,
+      "sampling/sampling_logp_difference/max": 1.432037115097046,
+      "sampling/sampling_logp_difference/mean": 0.01319027692079544,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 7.133460348995868e-05,
+      "clip_ratio/high_mean": 2.1890245989197865e-05,
+      "clip_ratio/low_mean": 0.00012619525250556762,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001480855021327443,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 9682.125,
+      "completions/mean_terminated_length": 8135.53857421875,
+      "completions/min_length": 617.0,
+      "completions/min_terminated_length": 617.0,
+      "entropy": 0.45171455293893814,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0005967849283479154,
+      "learning_rate": 1e-05,
+      "loss": 0.0396,
+      "num_tokens": 43907504.0,
+      "reward": 0.40625,
+      "reward_std": 0.24511480331420898,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000497102737427,
+      "sampling/importance_sampling_ratio/min": 0.08479011803865433,
+      "sampling/sampling_logp_difference/max": 2.467576265335083,
+      "sampling/sampling_logp_difference/mean": 0.01833641156554222,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.00015240923221426783,
+      "clip_ratio/high_mean": 4.380486257105076e-05,
+      "clip_ratio/low_mean": 0.000209838211958413,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002536430743020901,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6274.828125,
+      "completions/mean_terminated_length": 5600.8837890625,
+      "completions/min_length": 1218.0,
+      "completions/min_terminated_length": 1218.0,
+      "entropy": 0.43740469962358475,
+      "epoch": 0.05841766329346826,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002006928203627467,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 44318477.0,
+      "reward": 0.609375,
+      "reward_std": 0.27883461117744446,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000008225440979,
+      "sampling/importance_sampling_ratio/min": 0.006873338017612696,
+      "sampling/sampling_logp_difference/max": 4.980105400085449,
+      "sampling/sampling_logp_difference/mean": 0.0172873605042696,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 3.4462957046343945e-05,
+      "clip_ratio/high_mean": 8.615739261585986e-06,
+      "clip_ratio/low_mean": 0.00021862963694729842,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00022724537666363176,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15911.0,
+      "completions/max_terminated_length": 15911.0,
+      "completions/mean_length": 4089.046875,
+      "completions/mean_terminated_length": 4089.046875,
+      "completions/min_length": 917.0,
+      "completions/min_terminated_length": 917.0,
+      "entropy": 0.44774849712848663,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00209969119168818,
+      "learning_rate": 1e-05,
+      "loss": -0.0304,
+      "num_tokens": 44589816.0,
+      "reward": 0.421875,
+      "reward_std": 0.34717273712158203,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000078678131104,
+      "sampling/importance_sampling_ratio/min": 0.29485389590263367,
+      "sampling/sampling_logp_difference/max": 1.2212753295898438,
+      "sampling/sampling_logp_difference/mean": 0.014491476118564606,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.00016890352435439127,
+      "clip_ratio/high_mean": 5.8580551922204904e-05,
+      "clip_ratio/low_mean": 0.00029687383357668296,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00035545438731787726,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12514.0,
+      "completions/max_terminated_length": 12514.0,
+      "completions/mean_length": 4120.46875,
+      "completions/mean_terminated_length": 4120.46875,
+      "completions/min_length": 570.0,
+      "completions/min_terminated_length": 570.0,
+      "entropy": 0.37828731164336205,
+      "epoch": 0.059337626494940204,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.00247256294824183,
+      "learning_rate": 1e-05,
+      "loss": -0.0719,
+      "num_tokens": 44863870.0,
+      "reward": 0.640625,
+      "reward_std": 0.37298911809921265,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999027252197266,
+      "sampling/importance_sampling_ratio/min": 0.25366994738578796,
+      "sampling/sampling_logp_difference/max": 1.3717212677001953,
+      "sampling/sampling_logp_difference/mean": 0.01514413021504879,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 6.054695404600352e-05,
+      "clip_ratio/high_mean": 1.513673851150088e-05,
+      "clip_ratio/low_mean": 0.00011639616241154727,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00013153290092304815,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 4336.640625,
+      "completions/mean_terminated_length": 4145.4130859375,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "entropy": 0.5505912192165852,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002205714350566268,
+      "learning_rate": 1e-05,
+      "loss": 0.0071,
+      "num_tokens": 45150095.0,
+      "reward": 0.265625,
+      "reward_std": 0.3051002323627472,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44515693187713623,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001391172409058,
+      "sampling/importance_sampling_ratio/min": 0.06648644804954529,
+      "sampling/sampling_logp_difference/max": 2.710757255554199,
+      "sampling/sampling_logp_difference/mean": 0.017366381362080574,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.00013182537441025488,
+      "clip_ratio/high_mean": 3.979877965321066e-05,
+      "clip_ratio/low_mean": 0.000278371120657539,
+      "clip_ratio/low_min": 2.8801843654946424e-05,
+      "clip_ratio/region_mean": 0.00031816989758226555,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15715.0,
+      "completions/mean_length": 5626.6875,
+      "completions/mean_terminated_length": 5279.67724609375,
+      "completions/min_length": 859.0,
+      "completions/min_terminated_length": 859.0,
+      "entropy": 0.4813901446759701,
+      "epoch": 0.060257589696412144,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0019324850291013718,
+      "learning_rate": 1e-05,
+      "loss": 0.0192,
+      "num_tokens": 45520363.0,
+      "reward": 0.59375,
+      "reward_std": 0.3956102430820465,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999158382415771,
+      "sampling/importance_sampling_ratio/min": 0.0007231447380036116,
+      "sampling/sampling_logp_difference/max": 7.231901168823242,
+      "sampling/sampling_logp_difference/mean": 0.01795651763677597,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0001917856679938268,
+      "clip_ratio/high_mean": 7.190962878667051e-05,
+      "clip_ratio/low_mean": 0.0003002988987645949,
+      "clip_ratio/low_min": 2.3995393348741345e-05,
+      "clip_ratio/region_mean": 0.0003722085129993502,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15459.0,
+      "completions/max_terminated_length": 15459.0,
+      "completions/mean_length": 5463.125,
+      "completions/mean_terminated_length": 5463.125,
+      "completions/min_length": 822.0,
+      "completions/min_terminated_length": 822.0,
+      "entropy": 0.48498839512467384,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0010704556480050087,
+      "learning_rate": 1e-05,
+      "loss": 0.0345,
+      "num_tokens": 45879459.0,
+      "reward": 0.5,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000200271606445,
+      "sampling/importance_sampling_ratio/min": 0.24045711755752563,
+      "sampling/sampling_logp_difference/max": 1.4252135753631592,
+      "sampling/sampling_logp_difference/mean": 0.01702497899532318,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.00018260821707372088,
+      "clip_ratio/high_mean": 8.817236493996461e-05,
+      "clip_ratio/low_mean": 0.00028106225181545597,
+      "clip_ratio/low_min": 1.3495277016772889e-05,
+      "clip_ratio/region_mean": 0.00036923462903359905,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15900.0,
+      "completions/mean_length": 7030.078125,
+      "completions/mean_terminated_length": 6728.33837890625,
+      "completions/min_length": 679.0,
+      "completions/min_terminated_length": 679.0,
+      "entropy": 0.5583953745663166,
+      "epoch": 0.06117755289788408,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004807902034372091,
+      "learning_rate": 1e-05,
+      "loss": 0.0251,
+      "num_tokens": 46339208.0,
+      "reward": 0.4375,
+      "reward_std": 0.34352827072143555,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000038146972656,
+      "sampling/importance_sampling_ratio/min": 0.16341879963874817,
+      "sampling/sampling_logp_difference/max": 1.811439037322998,
+      "sampling/sampling_logp_difference/mean": 0.02068626880645752,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 6.616325117647648e-05,
+      "clip_ratio/high_mean": 1.654081279411912e-05,
+      "clip_ratio/low_mean": 0.00023565934952785028,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002522001623219694,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 10388.0,
+      "completions/mean_length": 3639.796875,
+      "completions/mean_terminated_length": 3228.693359375,
+      "completions/min_length": 650.0,
+      "completions/min_terminated_length": 650.0,
+      "entropy": 0.3612133227288723,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003295397385954857,
+      "learning_rate": 1e-05,
+      "loss": 0.0954,
+      "num_tokens": 46581867.0,
+      "reward": 0.53125,
+      "reward_std": 0.2756393849849701,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999962329864502,
+      "sampling/importance_sampling_ratio/min": 0.26906853914260864,
+      "sampling/sampling_logp_difference/max": 1.3127890825271606,
+      "sampling/sampling_logp_difference/mean": 0.013889246620237827,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.00010353518246120075,
+      "clip_ratio/high_mean": 2.7723654284272925e-05,
+      "clip_ratio/low_mean": 0.00025271423010053695,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00028043788643117296,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15826.0,
+      "completions/mean_length": 6138.40625,
+      "completions/mean_terminated_length": 5270.1357421875,
+      "completions/min_length": 948.0,
+      "completions/min_terminated_length": 948.0,
+      "entropy": 0.46735797077417374,
+      "epoch": 0.06209751609935603,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0006881815497763455,
+      "learning_rate": 1e-05,
+      "loss": -0.0115,
+      "num_tokens": 46984573.0,
+      "reward": 0.375,
+      "reward_std": 0.2619796395301819,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001217126846313,
+      "sampling/importance_sampling_ratio/min": 0.1895102858543396,
+      "sampling/sampling_logp_difference/max": 1.6633119583129883,
+      "sampling/sampling_logp_difference/mean": 0.018007703125476837,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0001601026592652488,
+      "clip_ratio/high_mean": 6.810909269461263e-05,
+      "clip_ratio/low_mean": 0.00029509376508940477,
+      "clip_ratio/low_min": 8.118738332996145e-05,
+      "clip_ratio/region_mean": 0.0003632028547144728,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 6203.078125,
+      "completions/mean_terminated_length": 6041.4765625,
+      "completions/min_length": 994.0,
+      "completions/min_terminated_length": 994.0,
+      "entropy": 0.34924061596393585,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0030411158222705126,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 47390530.0,
+      "reward": 0.5625,
+      "reward_std": 0.45134252309799194,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000066757202148,
+      "sampling/importance_sampling_ratio/min": 0.0038473873864859343,
+      "sampling/sampling_logp_difference/max": 5.560360908508301,
+      "sampling/sampling_logp_difference/mean": 0.014004556462168694,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.00013779216351395007,
+      "clip_ratio/high_mean": 3.622300414463098e-05,
+      "clip_ratio/low_mean": 0.000246863734901126,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002830867379088886,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16327.0,
+      "completions/mean_length": 5275.09375,
+      "completions/mean_terminated_length": 4534.5,
+      "completions/min_length": 550.0,
+      "completions/min_terminated_length": 550.0,
+      "entropy": 0.48541659861803055,
+      "epoch": 0.06301747930082796,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019960978534072638,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 47738616.0,
+      "reward": 0.5625,
+      "reward_std": 0.3787454068660736,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998708367347717,
+      "sampling/importance_sampling_ratio/min": 0.25954926013946533,
+      "sampling/sampling_logp_difference/max": 2.035876750946045,
+      "sampling/sampling_logp_difference/mean": 0.01738009974360466,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 6.566441879840568e-05,
+      "clip_ratio/high_mean": 1.9573946701711975e-05,
+      "clip_ratio/low_mean": 0.00018548900698078796,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00020506295550148934,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 6902.90625,
+      "completions/mean_terminated_length": 6270.83349609375,
+      "completions/min_length": 572.0,
+      "completions/min_terminated_length": 572.0,
+      "entropy": 0.5758580937981606,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018249191343784332,
+      "learning_rate": 1e-05,
+      "loss": 0.0649,
+      "num_tokens": 48190938.0,
+      "reward": 0.3125,
+      "reward_std": 0.22461533546447754,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.467176616191864,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000348091125488,
+      "sampling/importance_sampling_ratio/min": 0.0013268361799418926,
+      "sampling/sampling_logp_difference/max": 6.624958038330078,
+      "sampling/sampling_logp_difference/mean": 0.02041659690439701,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.00017541761735628825,
+      "clip_ratio/high_mean": 4.709997801910504e-05,
+      "clip_ratio/low_mean": 0.0001230241168741486,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001701240935290116,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16243.0,
+      "completions/mean_length": 6085.34375,
+      "completions/mean_terminated_length": 5578.85205078125,
+      "completions/min_length": 735.0,
+      "completions/min_terminated_length": 735.0,
+      "entropy": 0.42954346910119057,
+      "epoch": 0.06393744250229991,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.000927706656511873,
+      "learning_rate": 1e-05,
+      "loss": -0.0193,
+      "num_tokens": 48589088.0,
+      "reward": 0.515625,
+      "reward_std": 0.17782479524612427,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999110102653503,
+      "sampling/importance_sampling_ratio/min": 0.015892520546913147,
+      "sampling/sampling_logp_difference/max": 4.14190673828125,
+      "sampling/sampling_logp_difference/mean": 0.016232255846261978,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.00010605377701722318,
+      "clip_ratio/high_mean": 2.6513444254305796e-05,
+      "clip_ratio/low_mean": 0.00017001426112983609,
+      "clip_ratio/low_min": 9.667440281191375e-06,
+      "clip_ratio/region_mean": 0.00019652770470202086,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 11692.0,
+      "completions/mean_length": 4587.140625,
+      "completions/mean_terminated_length": 3800.68359375,
+      "completions/min_length": 650.0,
+      "completions/min_terminated_length": 650.0,
+      "entropy": 0.3784865029156208,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002462266944348812,
+      "learning_rate": 1e-05,
+      "loss": 0.1149,
+      "num_tokens": 48891993.0,
+      "reward": 0.75,
+      "reward_std": 0.3119301199913025,
+      "rewards/accuracy_reward/mean": 0.75,
+      "rewards/accuracy_reward/std": 0.4364357888698578,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999458193778992,
+      "sampling/importance_sampling_ratio/min": 0.04250956326723099,
+      "sampling/sampling_logp_difference/max": 3.1580262184143066,
+      "sampling/sampling_logp_difference/mean": 0.013811696320772171,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 2.936265809694305e-05,
+      "clip_ratio/high_mean": 7.340664524235763e-06,
+      "clip_ratio/low_mean": 4.854745839111274e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.2195410363347037e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14722.0,
+      "completions/mean_length": 5666.703125,
+      "completions/mean_terminated_length": 5139.62255859375,
+      "completions/min_length": 966.0,
+      "completions/min_terminated_length": 966.0,
+      "entropy": 0.39824797213077545,
+      "epoch": 0.06485740570377185,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0003115626168437302,
+      "learning_rate": 1e-05,
+      "loss": -0.0184,
+      "num_tokens": 49264062.0,
+      "reward": 0.5625,
+      "reward_std": 0.1462520956993103,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000395774841309,
+      "sampling/importance_sampling_ratio/min": 0.06243892386555672,
+      "sampling/sampling_logp_difference/max": 2.773566484451294,
+      "sampling/sampling_logp_difference/mean": 0.015739524737000465,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.000158108177856775,
+      "clip_ratio/high_mean": 5.2915278502041474e-05,
+      "clip_ratio/low_mean": 0.00016461382892885013,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00021752910970462835,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15259.0,
+      "completions/mean_length": 6887.734375,
+      "completions/mean_terminated_length": 5721.5263671875,
+      "completions/min_length": 425.0,
+      "completions/min_terminated_length": 425.0,
+      "entropy": 0.38760824128985405,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0006745394202880561,
+      "learning_rate": 1e-05,
+      "loss": 0.0056,
+      "num_tokens": 49713061.0,
+      "reward": 0.65625,
+      "reward_std": 0.34034284949302673,
+      "rewards/accuracy_reward/mean": 0.65625,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999780654907227,
+      "sampling/importance_sampling_ratio/min": 0.06032606586813927,
+      "sampling/sampling_logp_difference/max": 2.8079910278320312,
+      "sampling/sampling_logp_difference/mean": 0.01489229779690504,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.00015935591181914788,
+      "clip_ratio/high_mean": 6.387877647284768e-05,
+      "clip_ratio/low_mean": 0.0001730375179249677,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00023691629667155212,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15544.0,
+      "completions/mean_length": 5005.8125,
+      "completions/mean_terminated_length": 4446.2294921875,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.4111749045550823,
+      "epoch": 0.06577736890524379,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002489981707185507,
+      "learning_rate": 1e-05,
+      "loss": 0.1439,
+      "num_tokens": 50041409.0,
+      "reward": 0.609375,
+      "reward_std": 0.38664886355400085,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999486207962036,
+      "sampling/importance_sampling_ratio/min": 0.08146519958972931,
+      "sampling/sampling_logp_difference/max": 2.5075793266296387,
+      "sampling/sampling_logp_difference/mean": 0.015489751473069191,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.00025049392115761293,
+      "clip_ratio/high_mean": 8.004182222975942e-05,
+      "clip_ratio/low_mean": 0.00020983324338885723,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00028987506448174827,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16315.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 5179.40625,
+      "completions/mean_terminated_length": 5179.40625,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 0.41279230639338493,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.002926065819337964,
+      "learning_rate": 1e-05,
+      "loss": 0.1158,
+      "num_tokens": 50382171.0,
+      "reward": 0.671875,
+      "reward_std": 0.40822193026542664,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999444484710693,
+      "sampling/importance_sampling_ratio/min": 0.25520530343055725,
+      "sampling/sampling_logp_difference/max": 1.6402530670166016,
+      "sampling/sampling_logp_difference/mean": 0.015069128945469856,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.00014712631036672974,
+      "clip_ratio/high_mean": 5.468455719892518e-05,
+      "clip_ratio/low_mean": 0.00023898459858173737,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002936691580543993,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 10295.0,
+      "completions/mean_length": 4353.640625,
+      "completions/mean_terminated_length": 3965.564453125,
+      "completions/min_length": 554.0,
+      "completions/min_terminated_length": 554.0,
+      "entropy": 0.3548976257443428,
+      "epoch": 0.06669733210671573,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0013896445743739605,
+      "learning_rate": 1e-05,
+      "loss": 0.1011,
+      "num_tokens": 50669884.0,
+      "reward": 0.609375,
+      "reward_std": 0.4150616228580475,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000076174736023,
+      "sampling/importance_sampling_ratio/min": 0.3658331036567688,
+      "sampling/sampling_logp_difference/max": 1.0578279495239258,
+      "sampling/sampling_logp_difference/mean": 0.01400618627667427,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.00018550837739894632,
+      "clip_ratio/high_mean": 5.588274325418752e-05,
+      "clip_ratio/low_mean": 0.00013934067465015687,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00019522341381161823,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 11380.0,
+      "completions/max_terminated_length": 11380.0,
+      "completions/mean_length": 3818.765625,
+      "completions/mean_terminated_length": 3818.765625,
+      "completions/min_length": 354.0,
+      "completions/min_terminated_length": 354.0,
+      "entropy": 0.4553304873406887,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022751668002456427,
+      "learning_rate": 1e-05,
+      "loss": 0.0159,
+      "num_tokens": 50923221.0,
+      "reward": 0.59375,
+      "reward_std": 0.23356688022613525,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999451637268066,
+      "sampling/importance_sampling_ratio/min": 0.4037473499774933,
+      "sampling/sampling_logp_difference/max": 0.906965970993042,
+      "sampling/sampling_logp_difference/mean": 0.017479849979281425,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.00012334420534898527,
+      "clip_ratio/high_mean": 3.775972527364502e-05,
+      "clip_ratio/low_mean": 0.00024554891206207685,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00028330863642622717,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15170.0,
+      "completions/mean_length": 4132.859375,
+      "completions/mean_terminated_length": 3938.39697265625,
+      "completions/min_length": 847.0,
+      "completions/min_terminated_length": 847.0,
+      "entropy": 0.3993053063750267,
+      "epoch": 0.06761729530818768,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003922725562006235,
+      "learning_rate": 1e-05,
+      "loss": 0.0842,
+      "num_tokens": 51197700.0,
+      "reward": 0.703125,
+      "reward_std": 0.3740273714065552,
+      "rewards/accuracy_reward/mean": 0.703125,
+      "rewards/accuracy_reward/std": 0.4604927599430084,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0002050399780273,
+      "sampling/importance_sampling_ratio/min": 0.09134452044963837,
+      "sampling/sampling_logp_difference/max": 2.3931169509887695,
+      "sampling/sampling_logp_difference/mean": 0.01436243113130331,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 6.435603609133977e-05,
+      "clip_ratio/high_mean": 1.806610680432641e-05,
+      "clip_ratio/low_mean": 0.0001659406625549309,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001840067711782467,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16096.0,
+      "completions/mean_length": 5267.796875,
+      "completions/mean_terminated_length": 4325.74560546875,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.38123703747987747,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0020941398106515408,
+      "learning_rate": 1e-05,
+      "loss": -0.0609,
+      "num_tokens": 51543671.0,
+      "reward": 0.53125,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999316930770874,
+      "sampling/importance_sampling_ratio/min": 0.23037536442279816,
+      "sampling/sampling_logp_difference/max": 2.5325393676757812,
+      "sampling/sampling_logp_difference/mean": 0.014637207612395287,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 9.257747115043458e-05,
+      "clip_ratio/high_mean": 3.454523766777129e-05,
+      "clip_ratio/low_mean": 0.00017795059829950333,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00021249583460303256,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 7131.109375,
+      "completions/mean_terminated_length": 5994.78955078125,
+      "completions/min_length": 618.0,
+      "completions/min_terminated_length": 618.0,
+      "entropy": 0.5104082711040974,
+      "epoch": 0.06853725850965961,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.00497112050652504,
+      "learning_rate": 1e-05,
+      "loss": 0.0182,
+      "num_tokens": 52009022.0,
+      "reward": 0.421875,
+      "reward_std": 0.2382849156856537,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000052571296692,
+      "sampling/importance_sampling_ratio/min": 0.1670650690793991,
+      "sampling/sampling_logp_difference/max": 2.297494888305664,
+      "sampling/sampling_logp_difference/mean": 0.01985531486570835,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.00023302506269828882,
+      "clip_ratio/high_mean": 6.607658588109189e-05,
+      "clip_ratio/low_mean": 0.0002972222391690593,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00036329882095742505,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14592.0,
+      "completions/mean_length": 5244.09375,
+      "completions/mean_terminated_length": 4696.22900390625,
+      "completions/min_length": 979.0,
+      "completions/min_terminated_length": 979.0,
+      "entropy": 0.43432193621993065,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0020549402106553316,
+      "learning_rate": 1e-05,
+      "loss": 0.0465,
+      "num_tokens": 52355452.0,
+      "reward": 0.578125,
+      "reward_std": 0.32878512144088745,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999597668647766,
+      "sampling/importance_sampling_ratio/min": 0.12826821208000183,
+      "sampling/sampling_logp_difference/max": 2.0536317825317383,
+      "sampling/sampling_logp_difference/mean": 0.01682550646364689,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 9.257711553800618e-05,
+      "clip_ratio/high_mean": 2.4764625095485826e-05,
+      "clip_ratio/low_mean": 0.00017624517386138905,
+      "clip_ratio/low_min": 3.04195455100853e-05,
+      "clip_ratio/region_mean": 0.00020100980009374325,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13305.0,
+      "completions/mean_length": 4665.59375,
+      "completions/mean_terminated_length": 4287.58056640625,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "entropy": 0.4567765109241009,
+      "epoch": 0.06945722171113156,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019571157172322273,
+      "learning_rate": 1e-05,
+      "loss": 0.1202,
+      "num_tokens": 52661730.0,
+      "reward": 0.609375,
+      "reward_std": 0.3766237497329712,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001718997955322,
+      "sampling/importance_sampling_ratio/min": 0.05863168090581894,
+      "sampling/sampling_logp_difference/max": 2.836480140686035,
+      "sampling/sampling_logp_difference/mean": 0.014873203821480274,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 0.0001826390007408918,
+      "clip_ratio/high_mean": 4.919741900266672e-05,
+      "clip_ratio/low_mean": 0.00017080424390769622,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002200016633651103,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15613.0,
+      "completions/mean_length": 5884.078125,
+      "completions/mean_terminated_length": 4994.25439453125,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.46595389023423195,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001585305784828961,
+      "learning_rate": 1e-05,
+      "loss": -0.01,
+      "num_tokens": 53047807.0,
+      "reward": 0.6875,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.6875,
+      "rewards/accuracy_reward/std": 0.467176616191864,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.22881244122982025,
+      "sampling/sampling_logp_difference/max": 1.76997709274292,
+      "sampling/sampling_logp_difference/mean": 0.017738256603479385,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 0.00021771475485365954,
+      "clip_ratio/high_mean": 6.966911666950182e-05,
+      "clip_ratio/low_mean": 0.0002597284528746968,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003293975751148537,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15895.0,
+      "completions/mean_length": 6931.078125,
+      "completions/mean_terminated_length": 6781.0322265625,
+      "completions/min_length": 1018.0,
+      "completions/min_terminated_length": 1018.0,
+      "entropy": 0.4630916491150856,
+      "epoch": 0.0703771849126035,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001962294103577733,
+      "learning_rate": 1e-05,
+      "loss": 0.0243,
+      "num_tokens": 53499988.0,
+      "reward": 0.578125,
+      "reward_std": 0.32407689094543457,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897910118103,
+      "sampling/importance_sampling_ratio/min": 0.250827819108963,
+      "sampling/sampling_logp_difference/max": 1.3829885721206665,
+      "sampling/sampling_logp_difference/mean": 0.019276238977909088,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 9.259459875465836e-05,
+      "clip_ratio/high_mean": 2.314864968866459e-05,
+      "clip_ratio/low_mean": 0.0001647196718295163,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001878683215181809,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16380.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 4187.609375,
+      "completions/mean_terminated_length": 4187.609375,
+      "completions/min_length": 452.0,
+      "completions/min_terminated_length": 452.0,
+      "entropy": 0.4736147038638592,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0039059605915099382,
+      "learning_rate": 1e-05,
+      "loss": -0.0232,
+      "num_tokens": 53777891.0,
+      "reward": 0.515625,
+      "reward_std": 0.23144522309303284,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000485181808472,
+      "sampling/importance_sampling_ratio/min": 0.059286389499902725,
+      "sampling/sampling_logp_difference/max": 2.825375556945801,
+      "sampling/sampling_logp_difference/mean": 0.017982449382543564,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 0.0002281807282997761,
+      "clip_ratio/high_mean": 6.215211851667846e-05,
+      "clip_ratio/low_mean": 0.00041431118188484106,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0004764632985825301,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 11462.0,
+      "completions/max_terminated_length": 11462.0,
+      "completions/mean_length": 4636.609375,
+      "completions/mean_terminated_length": 4636.609375,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.4699357636272907,
+      "epoch": 0.07129714811407543,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015674931928515434,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 54083730.0,
+      "reward": 0.421875,
+      "reward_std": 0.32407689094543457,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999392032623291,
+      "sampling/importance_sampling_ratio/min": 0.3012267053127289,
+      "sampling/sampling_logp_difference/max": 1.4397335052490234,
+      "sampling/sampling_logp_difference/mean": 0.018028534948825836,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 0.00019255842562415637,
+      "clip_ratio/high_mean": 6.385802771546878e-05,
+      "clip_ratio/low_mean": 0.00015325745880545583,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00021711548879466136,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14616.0,
+      "completions/mean_length": 4546.90625,
+      "completions/mean_terminated_length": 4165.064453125,
+      "completions/min_length": 1046.0,
+      "completions/min_terminated_length": 1046.0,
+      "entropy": 0.3721245974302292,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0034301101695746183,
+      "learning_rate": 1e-05,
+      "loss": -0.0319,
+      "num_tokens": 54384076.0,
+      "reward": 0.703125,
+      "reward_std": 0.344576358795166,
+      "rewards/accuracy_reward/mean": 0.703125,
+      "rewards/accuracy_reward/std": 0.4604927599430084,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999713897705078,
+      "sampling/importance_sampling_ratio/min": 0.3777761459350586,
+      "sampling/sampling_logp_difference/max": 0.9734535217285156,
+      "sampling/sampling_logp_difference/mean": 0.013778747990727425,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 0.0002033198470599018,
+      "clip_ratio/high_mean": 8.211985141315381e-05,
+      "clip_ratio/low_mean": 0.00019877607019225252,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00028089592706237454,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15427.0,
+      "completions/mean_length": 5861.140625,
+      "completions/mean_terminated_length": 5159.61669921875,
+      "completions/min_length": 922.0,
+      "completions/min_terminated_length": 922.0,
+      "entropy": 0.402770210057497,
+      "epoch": 0.07221711131554738,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0019413913832977414,
+      "learning_rate": 1e-05,
+      "loss": -0.015,
+      "num_tokens": 54768301.0,
+      "reward": 0.6875,
+      "reward_std": 0.3913668990135193,
+      "rewards/accuracy_reward/mean": 0.6875,
+      "rewards/accuracy_reward/std": 0.467176616191864,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999744892120361,
+      "sampling/importance_sampling_ratio/min": 0.0876406580209732,
+      "sampling/sampling_logp_difference/max": 2.4345102310180664,
+      "sampling/sampling_logp_difference/mean": 0.01538553275167942,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.00023284805683942977,
+      "clip_ratio/high_mean": 8.598793738201493e-05,
+      "clip_ratio/low_mean": 0.0003366365535839577,
+      "clip_ratio/low_min": 4.388567595015047e-05,
+      "clip_ratio/region_mean": 0.00042262449642294087,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 4339.15625,
+      "completions/mean_terminated_length": 4147.96826171875,
+      "completions/min_length": 589.0,
+      "completions/min_terminated_length": 589.0,
+      "entropy": 0.41392209380865097,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0042099012061953545,
+      "learning_rate": 1e-05,
+      "loss": 0.1212,
+      "num_tokens": 55056167.0,
+      "reward": 0.5625,
+      "reward_std": 0.4991811513900757,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999838471412659,
+      "sampling/importance_sampling_ratio/min": 0.19788731634616852,
+      "sampling/sampling_logp_difference/max": 1.8298425674438477,
+      "sampling/sampling_logp_difference/mean": 0.016492057591676712,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 0.00010216615919489413,
+      "clip_ratio/high_mean": 2.956860225822311e-05,
+      "clip_ratio/low_mean": 0.00016890704591787653,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001984756468118576,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 11991.0,
+      "completions/mean_length": 3860.734375,
+      "completions/mean_terminated_length": 3661.95263671875,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 0.4760429188609123,
+      "epoch": 0.07313707451701931,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0018255900358781219,
+      "learning_rate": 1e-05,
+      "loss": -0.0176,
+      "num_tokens": 55311598.0,
+      "reward": 0.5625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000277757644653,
+      "sampling/importance_sampling_ratio/min": 0.14818522334098816,
+      "sampling/sampling_logp_difference/max": 1.909292221069336,
+      "sampling/sampling_logp_difference/mean": 0.01704004406929016,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 0.0001849941472755745,
+      "clip_ratio/high_mean": 4.898101587968995e-05,
+      "clip_ratio/low_mean": 0.0001777787338141934,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002267597501486307,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 5348.65625,
+      "completions/mean_terminated_length": 4805.93408203125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.39790400862693787,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014152417425066233,
+      "learning_rate": 1e-05,
+      "loss": -0.0372,
+      "num_tokens": 55662544.0,
+      "reward": 0.546875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570846557617,
+      "sampling/importance_sampling_ratio/min": 0.08356278389692307,
+      "sampling/sampling_logp_difference/max": 2.482156991958618,
+      "sampling/sampling_logp_difference/mean": 0.016587980091571808,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 6.86579851389979e-05,
+      "clip_ratio/high_mean": 1.7164496284749475e-05,
+      "clip_ratio/low_mean": 0.00020281358229112811,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00021997808198648272,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15704.0,
+      "completions/mean_length": 6315.4375,
+      "completions/mean_terminated_length": 5644.2001953125,
+      "completions/min_length": 740.0,
+      "completions/min_terminated_length": 740.0,
+      "entropy": 0.41329350695014,
+      "epoch": 0.07405703771849126,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0008581069996580482,
+      "learning_rate": 1e-05,
+      "loss": -0.078,
+      "num_tokens": 56076764.0,
+      "reward": 0.5,
+      "reward_std": 0.2961388826370239,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999759197235107,
+      "sampling/importance_sampling_ratio/min": 0.020804718136787415,
+      "sampling/sampling_logp_difference/max": 3.872575521469116,
+      "sampling/sampling_logp_difference/mean": 0.015737876296043396,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 0.00016506154497619718,
+      "clip_ratio/high_mean": 5.038856306782691e-05,
+      "clip_ratio/low_mean": 7.605237874486193e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00012644094090319413,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15289.0,
+      "completions/max_terminated_length": 15289.0,
+      "completions/mean_length": 3972.796875,
+      "completions/mean_terminated_length": 3972.796875,
+      "completions/min_length": 637.0,
+      "completions/min_terminated_length": 637.0,
+      "entropy": 0.3322669602930546,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002519205678254366,
+      "learning_rate": 1e-05,
+      "loss": 0.0093,
+      "num_tokens": 56340543.0,
+      "reward": 0.71875,
+      "reward_std": 0.28247910737991333,
+      "rewards/accuracy_reward/mean": 0.71875,
+      "rewards/accuracy_reward/std": 0.4531635046005249,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000035762786865,
+      "sampling/importance_sampling_ratio/min": 0.0027362185064703226,
+      "sampling/sampling_logp_difference/max": 5.901178359985352,
+      "sampling/sampling_logp_difference/mean": 0.012547864578664303,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 6.577009116881527e-05,
+      "clip_ratio/high_mean": 1.6442522792203818e-05,
+      "clip_ratio/low_mean": 0.0001267316197299806,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00014317414297693176,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12751.0,
+      "completions/max_terminated_length": 12751.0,
+      "completions/mean_length": 3359.796875,
+      "completions/mean_terminated_length": 3359.796875,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 0.4078873060643673,
+      "epoch": 0.0749770009199632,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0031421470921486616,
+      "learning_rate": 1e-05,
+      "loss": -0.044,
+      "num_tokens": 56573618.0,
+      "reward": 0.46875,
+      "reward_std": 0.26409149169921875,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 1.9408248662948608,
+      "sampling/importance_sampling_ratio/mean": 1.0000327825546265,
+      "sampling/importance_sampling_ratio/min": 0.05682510510087013,
+      "sampling/sampling_logp_difference/max": 2.867777109146118,
+      "sampling/sampling_logp_difference/mean": 0.015189846977591515,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 0.0002339555685466621,
+      "clip_ratio/high_mean": 5.848889213666553e-05,
+      "clip_ratio/low_mean": 0.00019084552513959352,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002493344186405011,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16243.0,
+      "completions/mean_length": 5143.953125,
+      "completions/mean_terminated_length": 4965.5400390625,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.45641181245446205,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0013033527648076415,
+      "learning_rate": 1e-05,
+      "loss": -0.001,
+      "num_tokens": 56912623.0,
+      "reward": 0.5,
+      "reward_std": 0.2540663480758667,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000520944595337,
+      "sampling/importance_sampling_ratio/min": 0.10672712326049805,
+      "sampling/sampling_logp_difference/max": 2.2374799251556396,
+      "sampling/sampling_logp_difference/mean": 0.018095262348651886,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 0.00015645647636119975,
+      "clip_ratio/high_mean": 5.4849623893460375e-05,
+      "clip_ratio/low_mean": 0.00022452728444477543,
+      "clip_ratio/low_min": 1.8452908989274874e-05,
+      "clip_ratio/region_mean": 0.00027937691174884094,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15524.0,
+      "completions/max_terminated_length": 15524.0,
+      "completions/mean_length": 4861.796875,
+      "completions/mean_terminated_length": 4861.796875,
+      "completions/min_length": 686.0,
+      "completions/min_terminated_length": 686.0,
+      "entropy": 0.3686519227921963,
+      "epoch": 0.07589696412143514,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002058787737041712,
+      "learning_rate": 1e-05,
+      "loss": -0.0061,
+      "num_tokens": 57231410.0,
+      "reward": 0.640625,
+      "reward_std": 0.4434390664100647,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000720024108887,
+      "sampling/importance_sampling_ratio/min": 0.29414093494415283,
+      "sampling/sampling_logp_difference/max": 1.223696231842041,
+      "sampling/sampling_logp_difference/mean": 0.013861306011676788,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 7.898918920545839e-05,
+      "clip_ratio/high_mean": 1.9747297301364597e-05,
+      "clip_ratio/low_mean": 0.00023958213569130749,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002593294357211562,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13863.0,
+      "completions/max_terminated_length": 13863.0,
+      "completions/mean_length": 3646.5625,
+      "completions/mean_terminated_length": 3646.5625,
+      "completions/min_length": 485.0,
+      "completions/min_terminated_length": 485.0,
+      "entropy": 0.4185665175318718,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021702114026993513,
+      "learning_rate": 1e-05,
+      "loss": -0.0115,
+      "num_tokens": 57476102.0,
+      "reward": 0.421875,
+      "reward_std": 0.34717273712158203,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999930202960968,
+      "sampling/importance_sampling_ratio/min": 0.26276907324790955,
+      "sampling/sampling_logp_difference/max": 1.336479663848877,
+      "sampling/sampling_logp_difference/mean": 0.015390090644359589,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 9.615711496735457e-05,
+      "clip_ratio/high_mean": 2.4039278741838643e-05,
+      "clip_ratio/low_mean": 8.294612644021981e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001069854047273111,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12572.0,
+      "completions/max_terminated_length": 12572.0,
+      "completions/mean_length": 3261.03125,
+      "completions/mean_terminated_length": 3261.03125,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.38933373615145683,
+      "epoch": 0.07681692732290708,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0019839778542518616,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 57698704.0,
+      "reward": 0.546875,
+      "reward_std": 0.24039676785469055,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 1.9696807861328125,
+      "sampling/importance_sampling_ratio/mean": 1.0000805854797363,
+      "sampling/importance_sampling_ratio/min": 0.23458817601203918,
+      "sampling/sampling_logp_difference/max": 1.4499237537384033,
+      "sampling/sampling_logp_difference/mean": 0.013868526555597782,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 0.00010562563329585828,
+      "clip_ratio/high_mean": 3.383952889635111e-05,
+      "clip_ratio/low_mean": 0.00020006230874969333,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00023390183969240752,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14331.0,
+      "completions/mean_length": 4422.734375,
+      "completions/mean_terminated_length": 4036.886962890625,
+      "completions/min_length": 410.0,
+      "completions/min_terminated_length": 410.0,
+      "entropy": 0.4031215123832226,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.001986635150387883,
+      "learning_rate": 1e-05,
+      "loss": 0.0557,
+      "num_tokens": 57993199.0,
+      "reward": 0.734375,
+      "reward_std": 0.4024401307106018,
+      "rewards/accuracy_reward/mean": 0.734375,
+      "rewards/accuracy_reward/std": 0.44515693187713623,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999784231185913,
+      "sampling/importance_sampling_ratio/min": 0.06947216391563416,
+      "sampling/sampling_logp_difference/max": 2.6668291091918945,
+      "sampling/sampling_logp_difference/mean": 0.015180530957877636,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 0.0001730810577100783,
+      "clip_ratio/high_mean": 4.773723605921987e-05,
+      "clip_ratio/low_mean": 0.0002442104923829902,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002919477325349362,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16171.0,
+      "completions/mean_length": 6199.890625,
+      "completions/mean_terminated_length": 5871.37060546875,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "entropy": 0.5146941468119621,
+      "epoch": 0.07773689052437903,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0007542786770500243,
+      "learning_rate": 1e-05,
+      "loss": 0.1269,
+      "num_tokens": 58399648.0,
+      "reward": 0.359375,
+      "reward_std": 0.3403330445289612,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000231266021729,
+      "sampling/importance_sampling_ratio/min": 0.19065004587173462,
+      "sampling/sampling_logp_difference/max": 1.657315731048584,
+      "sampling/sampling_logp_difference/mean": 0.018767032772302628,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 9.394133485329803e-05,
+      "clip_ratio/high_mean": 2.3485333713324508e-05,
+      "clip_ratio/low_mean": 0.00011168645596626448,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00013517178831534693,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15655.0,
+      "completions/mean_length": 3896.234375,
+      "completions/mean_terminated_length": 3698.01611328125,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 0.5078827440738678,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017290591495111585,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 58658479.0,
+      "reward": 0.34375,
+      "reward_std": 0.3119301199913025,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 7.327488128794357e-05,
+      "sampling/sampling_logp_difference/max": 9.521292686462402,
+      "sampling/sampling_logp_difference/mean": 0.017025060951709747,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.00020841371315327706,
+      "clip_ratio/high_mean": 8.523951714778377e-05,
+      "clip_ratio/low_mean": 0.00012622540452866815,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00021146491963008884,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 11989.0,
+      "completions/mean_length": 4735.96875,
+      "completions/mean_terminated_length": 4360.2255859375,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "entropy": 0.45149439200758934,
+      "epoch": 0.07865685372585096,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0011273949639871716,
+      "learning_rate": 1e-05,
+      "loss": 0.0249,
+      "num_tokens": 58972453.0,
+      "reward": 0.484375,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999274611473083,
+      "sampling/importance_sampling_ratio/min": 0.08670976012945175,
+      "sampling/sampling_logp_difference/max": 2.4451887607574463,
+      "sampling/sampling_logp_difference/mean": 0.01583728939294815,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 0.00019820678062387742,
+      "clip_ratio/high_mean": 7.964204723975854e-05,
+      "clip_ratio/low_mean": 0.00010668787808754132,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001863299248725525,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14688.0,
+      "completions/mean_length": 7339.859375,
+      "completions/mean_terminated_length": 6229.17529296875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.588470846414566,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0008887532167136669,
+      "learning_rate": 1e-05,
+      "loss": 0.0996,
+      "num_tokens": 59451860.0,
+      "reward": 0.578125,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999399185180664,
+      "sampling/importance_sampling_ratio/min": 0.07705886662006378,
+      "sampling/sampling_logp_difference/max": 2.563185691833496,
+      "sampling/sampling_logp_difference/mean": 0.02082424983382225,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 0.00018353125824432936,
+      "clip_ratio/high_mean": 4.588281456108234e-05,
+      "clip_ratio/low_mean": 0.0001578366407102294,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002037194581134827,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15312.0,
+      "completions/mean_length": 5281.125,
+      "completions/mean_terminated_length": 4132.5517578125,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 0.4296277277171612,
+      "epoch": 0.07957681692732291,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.001513984752818942,
+      "learning_rate": 1e-05,
+      "loss": -0.0775,
+      "num_tokens": 59799380.0,
+      "reward": 0.515625,
+      "reward_std": 0.4024401307106018,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999828934669495,
+      "sampling/importance_sampling_ratio/min": 0.008889591321349144,
+      "sampling/sampling_logp_difference/max": 4.722874164581299,
+      "sampling/sampling_logp_difference/mean": 0.018146134912967682,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 9.688743193692062e-05,
+      "clip_ratio/high_mean": 2.891019539674744e-05,
+      "clip_ratio/low_mean": 0.00018989583531947574,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00021880603753743344,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15439.0,
+      "completions/mean_length": 6502.078125,
+      "completions/mean_terminated_length": 5288.5087890625,
+      "completions/min_length": 646.0,
+      "completions/min_terminated_length": 646.0,
+      "entropy": 0.5399197302758694,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0024728032294660807,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 60224913.0,
+      "reward": 0.578125,
+      "reward_std": 0.23568853735923767,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999677538871765,
+      "sampling/importance_sampling_ratio/min": 0.16374215483665466,
+      "sampling/sampling_logp_difference/max": 1.809462308883667,
+      "sampling/sampling_logp_difference/mean": 0.018581923097372055,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 0.00013797258361591958,
+      "clip_ratio/high_mean": 3.965495989177725e-05,
+      "clip_ratio/low_mean": 0.0002399629820502014,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002796179478536942,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16090.0,
+      "completions/mean_length": 5760.640625,
+      "completions/mean_terminated_length": 5417.95166015625,
+      "completions/min_length": 779.0,
+      "completions/min_terminated_length": 779.0,
+      "entropy": 0.4306669682264328,
+      "epoch": 0.08049678012879485,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0021515442058444023,
+      "learning_rate": 1e-05,
+      "loss": 0.0035,
+      "num_tokens": 60602346.0,
+      "reward": 0.59375,
+      "reward_std": 0.3777071237564087,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000123977661133,
+      "sampling/importance_sampling_ratio/min": 0.07492339611053467,
+      "sampling/sampling_logp_difference/max": 2.5912890434265137,
+      "sampling/sampling_logp_difference/mean": 0.018847323954105377,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 0.00012638730822800426,
+      "clip_ratio/high_mean": 3.932306321985379e-05,
+      "clip_ratio/low_mean": 0.00021857243700651452,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00025789550363697344,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15591.0,
+      "completions/mean_length": 5001.828125,
+      "completions/mean_terminated_length": 4442.048828125,
+      "completions/min_length": 854.0,
+      "completions/min_terminated_length": 854.0,
+      "entropy": 0.386859655380249,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002064446220174432,
+      "learning_rate": 1e-05,
+      "loss": 0.0518,
+      "num_tokens": 60933319.0,
+      "reward": 0.671875,
+      "reward_std": 0.4024401307106018,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999225735664368,
+      "sampling/importance_sampling_ratio/min": 0.001200833241455257,
+      "sampling/sampling_logp_difference/max": 6.7247395515441895,
+      "sampling/sampling_logp_difference/mean": 0.015642710030078888,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 0.00010242240023217164,
+      "clip_ratio/high_mean": 2.9891118629166158e-05,
+      "clip_ratio/low_mean": 4.038109773318865e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.027221681710216e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 11816.0,
+      "completions/max_terminated_length": 11816.0,
+      "completions/mean_length": 3618.96875,
+      "completions/mean_terminated_length": 3618.96875,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 0.5046555213630199,
+      "epoch": 0.08141674333026679,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0018260818906128407,
+      "learning_rate": 1e-05,
+      "loss": -0.1102,
+      "num_tokens": 61174021.0,
+      "reward": 0.65625,
+      "reward_std": 0.2925041913986206,
+      "rewards/accuracy_reward/mean": 0.65625,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000065803527832,
+      "sampling/importance_sampling_ratio/min": 0.4336312413215637,
+      "sampling/sampling_logp_difference/max": 0.8355607986450195,
+      "sampling/sampling_logp_difference/mean": 0.01464638952165842,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 0.0001466885069021373,
+      "clip_ratio/high_mean": 4.841328131988121e-05,
+      "clip_ratio/low_mean": 0.00021246483720460674,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002608781214803457,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15898.0,
+      "completions/max_terminated_length": 15898.0,
+      "completions/mean_length": 6726.5625,
+      "completions/mean_terminated_length": 6726.5625,
+      "completions/min_length": 792.0,
+      "completions/min_terminated_length": 792.0,
+      "entropy": 0.529183816164732,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0017305320361629128,
+      "learning_rate": 1e-05,
+      "loss": 0.1277,
+      "num_tokens": 61614929.0,
+      "reward": 0.546875,
+      "reward_std": 0.398196816444397,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999656081199646,
+      "sampling/importance_sampling_ratio/min": 0.008894972503185272,
+      "sampling/sampling_logp_difference/max": 4.722269058227539,
+      "sampling/sampling_logp_difference/mean": 0.01910485327243805,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 7.97040474935784e-05,
+      "clip_ratio/high_mean": 1.99260118733946e-05,
+      "clip_ratio/low_mean": 0.0001781473129085498,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00019807332500931807,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12765.0,
+      "completions/mean_length": 4369.1875,
+      "completions/mean_terminated_length": 3778.294921875,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 0.41936155781149864,
+      "epoch": 0.08233670653173873,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0009695704793557525,
+      "learning_rate": 1e-05,
+      "loss": 0.0105,
+      "num_tokens": 61905869.0,
+      "reward": 0.375,
+      "reward_std": 0.2961388826370239,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000646114349365,
+      "sampling/importance_sampling_ratio/min": 0.08589151501655579,
+      "sampling/sampling_logp_difference/max": 2.4546701908111572,
+      "sampling/sampling_logp_difference/mean": 0.014310698956251144,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 0.00015925395200611092,
+      "clip_ratio/high_mean": 5.9038932477051276e-05,
+      "clip_ratio/low_mean": 0.00033926496098501957,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003983038918704551,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13708.0,
+      "completions/max_terminated_length": 13708.0,
+      "completions/mean_length": 4426.296875,
+      "completions/mean_terminated_length": 4426.296875,
+      "completions/min_length": 724.0,
+      "completions/min_terminated_length": 724.0,
+      "entropy": 0.3644730970263481,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0037218655925244093,
+      "learning_rate": 1e-05,
+      "loss": 0.015,
+      "num_tokens": 62198704.0,
+      "reward": 0.671875,
+      "reward_std": 0.38194066286087036,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805688858032,
+      "sampling/importance_sampling_ratio/min": 0.17468461394309998,
+      "sampling/sampling_logp_difference/max": 1.7447731494903564,
+      "sampling/sampling_logp_difference/mean": 0.01487559825181961,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 8.359176717931405e-05,
+      "clip_ratio/high_mean": 3.191635573784879e-05,
+      "clip_ratio/low_mean": 0.0004070220602443442,
+      "clip_ratio/low_min": 3.077680594287813e-05,
+      "clip_ratio/region_mean": 0.0004389384193927981,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16178.0,
+      "completions/mean_length": 5812.25,
+      "completions/mean_terminated_length": 5292.32763671875,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 0.45648736134171486,
+      "epoch": 0.08325666973321068,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0027508491184562445,
+      "learning_rate": 1e-05,
+      "loss": 0.031,
+      "num_tokens": 62580064.0,
+      "reward": 0.421875,
+      "reward_std": 0.4650121033191681,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000232458114624,
+      "sampling/importance_sampling_ratio/min": 0.08442410081624985,
+      "sampling/sampling_logp_difference/max": 2.471902370452881,
+      "sampling/sampling_logp_difference/mean": 0.016401425004005432,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 4.4125090425950475e-05,
+      "clip_ratio/high_mean": 1.1031272606487619e-05,
+      "clip_ratio/low_mean": 7.451805299751868e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.554932537663262e-05,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15625.0,
+      "completions/mean_length": 7307.546875,
+      "completions/mean_terminated_length": 6219.73681640625,
+      "completions/min_length": 664.0,
+      "completions/min_terminated_length": 664.0,
+      "entropy": 0.6124343201518059,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.0018599852919578552,
+      "learning_rate": 1e-05,
+      "loss": -0.0063,
+      "num_tokens": 63064195.0,
+      "reward": 0.234375,
+      "reward_std": 0.1315089464187622,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42695629596710205,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999361038208008,
+      "sampling/importance_sampling_ratio/min": 2.8124927098360786e-07,
+      "sampling/sampling_logp_difference/max": 15.084024429321289,
+      "sampling/sampling_logp_difference/mean": 0.02131102979183197,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 0.00011692623138515046,
+      "clip_ratio/high_mean": 4.184022350273153e-05,
+      "clip_ratio/low_mean": 0.00020591235761457938,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00024775257406872697,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16140.0,
+      "completions/mean_length": 6444.59375,
+      "completions/mean_terminated_length": 5602.27099609375,
+      "completions/min_length": 460.0,
+      "completions/min_terminated_length": 460.0,
+      "entropy": 0.3952566310763359,
+      "epoch": 0.08417663293468261,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023683710023760796,
+      "learning_rate": 1e-05,
+      "loss": 0.1244,
+      "num_tokens": 63488401.0,
+      "reward": 0.671875,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000025987625122,
+      "sampling/importance_sampling_ratio/min": 0.0319228358566761,
+      "sampling/sampling_logp_difference/max": 3.4444336891174316,
+      "sampling/sampling_logp_difference/mean": 0.016494080424308777,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 0.0004113621180295013,
+      "clip_ratio/high_mean": 0.00013114491684973473,
+      "clip_ratio/low_mean": 0.00016759118079789914,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00029873609946662327,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15472.0,
+      "completions/max_terminated_length": 15472.0,
+      "completions/mean_length": 4065.046875,
+      "completions/mean_terminated_length": 4065.046875,
+      "completions/min_length": 761.0,
+      "completions/min_terminated_length": 761.0,
+      "entropy": 0.4868856966495514,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0024668388068675995,
+      "learning_rate": 1e-05,
+      "loss": 0.0968,
+      "num_tokens": 63757052.0,
+      "reward": 0.59375,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999004006385803,
+      "sampling/importance_sampling_ratio/min": 0.17345686256885529,
+      "sampling/sampling_logp_difference/max": 1.751826286315918,
+      "sampling/sampling_logp_difference/mean": 0.016633857041597366,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 0.00011292714589217212,
+      "clip_ratio/high_mean": 3.753670944206533e-05,
+      "clip_ratio/low_mean": 0.00019003584657184547,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00022757255737815285,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14510.0,
+      "completions/mean_length": 4484.328125,
+      "completions/mean_terminated_length": 3899.09814453125,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.42744625359773636,
+      "epoch": 0.08509659613615456,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0025180387310683727,
+      "learning_rate": 1e-05,
+      "loss": 0.0264,
+      "num_tokens": 64056361.0,
+      "reward": 0.65625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.65625,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000048279762268,
+      "sampling/importance_sampling_ratio/min": 0.10562728345394135,
+      "sampling/sampling_logp_difference/max": 2.2478384971618652,
+      "sampling/sampling_logp_difference/mean": 0.016854196786880493,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0001615245364519069,
+      "clip_ratio/high_mean": 5.795405149910948e-05,
+      "clip_ratio/low_mean": 0.0002756438070719014,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00033359785993525293,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13914.0,
+      "completions/mean_length": 6078.625,
+      "completions/mean_terminated_length": 5205.2880859375,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 0.44200121238827705,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003769052680581808,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 64456505.0,
+      "reward": 0.609375,
+      "reward_std": 0.34246450662612915,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000364780426025,
+      "sampling/importance_sampling_ratio/min": 0.05654711276292801,
+      "sampling/sampling_logp_difference/max": 2.872681140899658,
+      "sampling/sampling_logp_difference/mean": 0.019126243889331818,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 7.941897001728648e-05,
+      "clip_ratio/high_mean": 2.403534449513245e-05,
+      "clip_ratio/low_mean": 0.00031823053905100096,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003422658846830018,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14135.0,
+      "completions/mean_length": 5394.234375,
+      "completions/mean_terminated_length": 4853.75390625,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.5750439912080765,
+      "epoch": 0.08601655933762649,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0011502739507704973,
+      "learning_rate": 1e-05,
+      "loss": 0.008,
+      "num_tokens": 64812360.0,
+      "reward": 0.3125,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.467176616191864,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000646114349365,
+      "sampling/importance_sampling_ratio/min": 1.883326774532179e-07,
+      "sampling/sampling_logp_difference/max": 15.485055923461914,
+      "sampling/sampling_logp_difference/mean": 0.01883636973798275,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 0.00017953455608221702,
+      "clip_ratio/high_mean": 5.071163013781188e-05,
+      "clip_ratio/low_mean": 0.00021184172874200158,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00026255335797031876,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 11992.0,
+      "completions/mean_length": 3735.09375,
+      "completions/mean_terminated_length": 3113.01611328125,
+      "completions/min_length": 1068.0,
+      "completions/min_terminated_length": 1068.0,
+      "entropy": 0.3347371593117714,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005435355007648468,
+      "learning_rate": 1e-05,
+      "loss": 0.2343,
+      "num_tokens": 65060254.0,
+      "reward": 0.703125,
+      "reward_std": 0.35141605138778687,
+      "rewards/accuracy_reward/mean": 0.703125,
+      "rewards/accuracy_reward/std": 0.4604927599430084,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000994205474854,
+      "sampling/importance_sampling_ratio/min": 0.3014819324016571,
+      "sampling/sampling_logp_difference/max": 1.3178880214691162,
+      "sampling/sampling_logp_difference/mean": 0.011843510903418064,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 0.00017920508889801567,
+      "clip_ratio/high_mean": 4.987927036381734e-05,
+      "clip_ratio/low_mean": 0.0003012043116541463,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00035108358360957936,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12396.0,
+      "completions/max_terminated_length": 12396.0,
+      "completions/mean_length": 3665.59375,
+      "completions/mean_terminated_length": 3665.59375,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "entropy": 0.3805939368903637,
+      "epoch": 0.08693652253909843,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0012891854858025908,
+      "learning_rate": 1e-05,
+      "loss": 0.0735,
+      "num_tokens": 65303860.0,
+      "reward": 0.671875,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999483823776245,
+      "sampling/importance_sampling_ratio/min": 0.1537669152021408,
+      "sampling/sampling_logp_difference/max": 1.8723173141479492,
+      "sampling/sampling_logp_difference/mean": 0.014232308603823185,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 0.00025017756706802174,
+      "clip_ratio/high_mean": 9.517185890217661e-05,
+      "clip_ratio/low_mean": 0.0002550413046265021,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00035021316580241546,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14472.0,
+      "completions/max_terminated_length": 14472.0,
+      "completions/mean_length": 4944.328125,
+      "completions/mean_terminated_length": 4944.328125,
+      "completions/min_length": 341.0,
+      "completions/min_terminated_length": 341.0,
+      "entropy": 0.4690123051404953,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.005588939413428307,
+      "learning_rate": 1e-05,
+      "loss": -0.0124,
+      "num_tokens": 65629153.0,
+      "reward": 0.625,
+      "reward_std": 0.44663429260253906,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999592900276184,
+      "sampling/importance_sampling_ratio/min": 0.25745633244514465,
+      "sampling/sampling_logp_difference/max": 1.356905221939087,
+      "sampling/sampling_logp_difference/mean": 0.018515609204769135,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 0.00018033986088994425,
+      "clip_ratio/high_mean": 6.675588315374625e-05,
+      "clip_ratio/low_mean": 0.00043161257781321183,
+      "clip_ratio/low_min": 7.48055026633665e-05,
+      "clip_ratio/region_mean": 0.0004983684593753424,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16026.0,
+      "completions/mean_length": 6431.15625,
+      "completions/mean_terminated_length": 5587.69482421875,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.42928578704595566,
+      "epoch": 0.08785648574057038,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0025411290116608143,
+      "learning_rate": 1e-05,
+      "loss": 0.1312,
+      "num_tokens": 66050187.0,
+      "reward": 0.53125,
+      "reward_std": 0.3956102132797241,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000171184539795,
+      "sampling/importance_sampling_ratio/min": 0.025968920439481735,
+      "sampling/sampling_logp_difference/max": 3.6508548259735107,
+      "sampling/sampling_logp_difference/mean": 0.018628142774105072,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 0.000127093402625178,
+      "clip_ratio/high_mean": 5.626492793453508e-05,
+      "clip_ratio/low_mean": 0.00029342325888137566,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003496881909086369,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15049.0,
+      "completions/mean_length": 4280.375,
+      "completions/mean_terminated_length": 3889.935302734375,
+      "completions/min_length": 572.0,
+      "completions/min_terminated_length": 572.0,
+      "entropy": 0.37104332447052,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.002807023236528039,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 66332643.0,
+      "reward": 0.625,
+      "reward_std": 0.44663429260253906,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000070571899414,
+      "sampling/importance_sampling_ratio/min": 0.29366788268089294,
+      "sampling/sampling_logp_difference/max": 1.2253057956695557,
+      "sampling/sampling_logp_difference/mean": 0.014485626481473446,
+      "step": 192
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 66332643,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-256/latest b/dapo_lora_7b_20251202_002719/checkpoint-256/latest
new file mode 100644
index 0000000000000000000000000000000000000000..b747f9725067064e241a7a3bed90583971af8ad1
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-256/latest
@@ -0,0 +1 @@
+global_step256
\ No newline at end of file
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-256/special_tokens_map.json b/dapo_lora_7b_20251202_002719/checkpoint-256/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-256/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-256/tokenizer_config.json b/dapo_lora_7b_20251202_002719/checkpoint-256/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-256/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-256/zero_to_fp32.py b/dapo_lora_7b_20251202_002719/checkpoint-256/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-256/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# set to a truthy value to make the helpers in this script print extra diagnostics
+debug = 0
+
+# load to cpu
+# (this is the ``map_location`` handed to ``torch.load`` by the parsing helpers)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pseudo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Raises:
+        - ``ValueError``: ``tag`` is not given and there is no ``latest`` file in ``checkpoint_dir``
+        - ``FileNotFoundError``: the tag folder does not exist inside ``checkpoint_dir``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    # without an explicit tag, read it from the 'latest' file inside the checkpoint folder
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    # lazy mode hands back the entries as they are; otherwise they are converted to torch tensors
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    The weights are written to ``output_dir``, split into shards when ``max_shard_size`` is set, and
+    an index file is written next to them when the result is sharded.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` disables sharding
+        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: ``safetensors`` or ``huggingface_hub`` is required by the chosen options but not installed
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # (lazy mode: entries are only converted to torch tensors shard by shard below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: the split is planned on empty tensors
+        # that only carry the shape and dtype of each entry
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: a minimal stand-in that puts every tensor into one file
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        # only the entries of the current shard are converted to torch tensors
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    # strict=False: keys that do not match between the model and the state_dict are not treated as errors
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/README.md b/dapo_lora_7b_20251202_002719/checkpoint-64/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..93603607c9fb9b3b4d2aece2cf11d1492643ced8
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-64/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/chat_template.jinja b/dapo_lora_7b_20251202_002719/checkpoint-64/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-64/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/latest b/dapo_lora_7b_20251202_002719/checkpoint-64/latest
new file mode 100644
index 0000000000000000000000000000000000000000..4a12e7f9029554e8e5ce68ebe3e97d0b4e734304
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-64/latest
@@ -0,0 +1 @@
+global_step64
\ No newline at end of file
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/special_tokens_map.json b/dapo_lora_7b_20251202_002719/checkpoint-64/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-64/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/tokenizer_config.json b/dapo_lora_7b_20251202_002719/checkpoint-64/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-64/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/trainer_state.json b/dapo_lora_7b_20251202_002719/checkpoint-64/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a0c2e8c9eeb8bf6080bf55e7d2012e5f33fecdd9
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-64/trainer_state.json
@@ -0,0 +1,2018 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.029438822447102116,
+  "eval_steps": 500,
+  "global_step": 64,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16233.0,
+      "completions/max_terminated_length": 16233.0,
+      "completions/mean_length": 5701.859375,
+      "completions/mean_terminated_length": 5701.859375,
+      "completions/min_length": 630.0,
+      "completions/min_terminated_length": 630.0,
+      "entropy": 0.35103847086429596,
+      "epoch": 0.00045998160073597056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0027150087989866734,
+      "learning_rate": 1e-05,
+      "loss": 0.0764,
+      "num_tokens": 372903.0,
+      "reward": 0.71875,
+      "reward_std": 0.4581822156906128,
+      "rewards/accuracy_reward/mean": 0.71875,
+      "rewards/accuracy_reward/std": 0.4531635046005249,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000617504119873,
+      "sampling/importance_sampling_ratio/min": 0.2750210464000702,
+      "sampling/sampling_logp_difference/max": 1.290907621383667,
+      "sampling/sampling_logp_difference/mean": 0.01358163170516491,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.00010992912939400412,
+      "clip_ratio/high_mean": 2.748228234850103e-05,
+      "clip_ratio/low_mean": 0.00016060493635450257,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001880872223409824,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 7385.90625,
+      "completions/mean_terminated_length": 6455.06884765625,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "entropy": 0.5675897598266602,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0007440462941303849,
+      "learning_rate": 1e-05,
+      "loss": -0.0152,
+      "num_tokens": 856873.0,
+      "reward": 0.390625,
+      "reward_std": 0.2198973000049591,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999367594718933,
+      "sampling/importance_sampling_ratio/min": 0.009396589361131191,
+      "sampling/sampling_logp_difference/max": 4.667408466339111,
+      "sampling/sampling_logp_difference/mean": 0.022290317341685295,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.00018680206630961038,
+      "clip_ratio/high_mean": 7.093910403455084e-05,
+      "clip_ratio/low_mean": 0.0002504906224203296,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00032142972168003325,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15091.0,
+      "completions/mean_length": 5608.828125,
+      "completions/mean_terminated_length": 5437.7939453125,
+      "completions/min_length": 936.0,
+      "completions/min_terminated_length": 936.0,
+      "entropy": 0.44635456055402756,
+      "epoch": 0.0013799448022079118,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002476191846653819,
+      "learning_rate": 1e-05,
+      "loss": 0.0755,
+      "num_tokens": 1225782.0,
+      "reward": 0.578125,
+      "reward_std": 0.3776973485946655,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999972581863403,
+      "sampling/importance_sampling_ratio/min": 0.16118201613426208,
+      "sampling/sampling_logp_difference/max": 1.825221061706543,
+      "sampling/sampling_logp_difference/mean": 0.017525848001241684,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0002787337944027968,
+      "clip_ratio/high_mean": 8.000510115380166e-05,
+      "clip_ratio/low_mean": 0.00027736531956179533,
+      "clip_ratio/low_min": 2.338634294574149e-05,
+      "clip_ratio/region_mean": 0.0003573704316295334,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14515.0,
+      "completions/max_terminated_length": 14515.0,
+      "completions/mean_length": 3346.078125,
+      "completions/mean_terminated_length": 3346.078125,
+      "completions/min_length": 793.0,
+      "completions/min_terminated_length": 793.0,
+      "entropy": 0.545745424926281,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0037713816855102777,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 1453315.0,
+      "reward": 0.4375,
+      "reward_std": 0.4413174092769623,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000426769256592,
+      "sampling/importance_sampling_ratio/min": 0.08748604357242584,
+      "sampling/sampling_logp_difference/max": 2.4362759590148926,
+      "sampling/sampling_logp_difference/mean": 0.016878074035048485,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0002736507922236342,
+      "clip_ratio/high_mean": 0.00012070279444742482,
+      "clip_ratio/low_mean": 0.00037263989906932693,
+      "clip_ratio/low_min": 7.880559132900089e-05,
+      "clip_ratio/region_mean": 0.0004933426898787729,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15992.0,
+      "completions/mean_length": 7791.578125,
+      "completions/mean_terminated_length": 5601.35302734375,
+      "completions/min_length": 788.0,
+      "completions/min_terminated_length": 788.0,
+      "entropy": 0.4527555741369724,
+      "epoch": 0.0022999080036798527,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0019191562896594405,
+      "learning_rate": 1e-05,
+      "loss": 0.066,
+      "num_tokens": 1962144.0,
+      "reward": 0.484375,
+      "reward_std": 0.4987064301967621,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000364780426025,
+      "sampling/importance_sampling_ratio/min": 0.09914527088403702,
+      "sampling/sampling_logp_difference/max": 2.311169147491455,
+      "sampling/sampling_logp_difference/mean": 0.019328925758600235,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.000247960046181106,
+      "clip_ratio/high_mean": 6.500758581751143e-05,
+      "clip_ratio/low_mean": 8.249791471826029e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00014750550326425582,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15770.0,
+      "completions/mean_length": 4726.546875,
+      "completions/mean_terminated_length": 4350.5,
+      "completions/min_length": 757.0,
+      "completions/min_terminated_length": 757.0,
+      "entropy": 0.5126069597899914,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002579454565420747,
+      "learning_rate": 1e-05,
+      "loss": -0.0359,
+      "num_tokens": 2273043.0,
+      "reward": 0.484375,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999161958694458,
+      "sampling/importance_sampling_ratio/min": 0.0002888332528527826,
+      "sampling/sampling_logp_difference/max": 8.14966106414795,
+      "sampling/sampling_logp_difference/mean": 0.01803017407655716,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.00017989838943321956,
+      "clip_ratio/high_mean": 6.093144725127786e-05,
+      "clip_ratio/low_mean": 0.00028579145509866066,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003467229043963016,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12743.0,
+      "completions/mean_length": 7409.0625,
+      "completions/mean_terminated_length": 6480.62060546875,
+      "completions/min_length": 879.0,
+      "completions/min_terminated_length": 879.0,
+      "entropy": 0.494194608181715,
+      "epoch": 0.003219871205151794,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002430765191093087,
+      "learning_rate": 1e-05,
+      "loss": 0.0822,
+      "num_tokens": 2757655.0,
+      "reward": 0.46875,
+      "reward_std": 0.40715816617012024,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999563694000244,
+      "sampling/importance_sampling_ratio/min": 0.17787444591522217,
+      "sampling/sampling_logp_difference/max": 1.726677417755127,
+      "sampling/sampling_logp_difference/mean": 0.019815418869256973,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.00017167176974908216,
+      "clip_ratio/high_mean": 6.041262804501457e-05,
+      "clip_ratio/low_mean": 0.0002822945152729517,
+      "clip_ratio/low_min": 5.028157829656266e-05,
+      "clip_ratio/region_mean": 0.00034270713513251394,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13078.0,
+      "completions/mean_length": 4700.203125,
+      "completions/mean_terminated_length": 4323.30615234375,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "entropy": 0.39490213245153427,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0022012051194906235,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 3072436.0,
+      "reward": 0.609375,
+      "reward_std": 0.49446311593055725,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998409152030945,
+      "sampling/importance_sampling_ratio/min": 0.06603337824344635,
+      "sampling/sampling_logp_difference/max": 2.717594861984253,
+      "sampling/sampling_logp_difference/mean": 0.016631681472063065,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.00013108045459375717,
+      "clip_ratio/high_mean": 4.318108904044493e-05,
+      "clip_ratio/low_mean": 0.00023819861780793872,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002813797018461628,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5188.890625,
+      "completions/mean_terminated_length": 4827.7578125,
+      "completions/min_length": 790.0,
+      "completions/min_terminated_length": 790.0,
+      "entropy": 0.43566014245152473,
+      "epoch": 0.004139834406623735,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0016241734847426414,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 3414085.0,
+      "reward": 0.59375,
+      "reward_std": 0.39820659160614014,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 1.9456381797790527,
+      "sampling/importance_sampling_ratio/mean": 1.0000399351119995,
+      "sampling/importance_sampling_ratio/min": 0.10360148549079895,
+      "sampling/sampling_logp_difference/max": 2.2672035694122314,
+      "sampling/sampling_logp_difference/mean": 0.01550372689962387,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.00010115922304976266,
+      "clip_ratio/high_mean": 2.5289805762440665e-05,
+      "clip_ratio/low_mean": 0.00034295484147151,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003682446440507192,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15217.0,
+      "completions/mean_length": 5832.875,
+      "completions/mean_terminated_length": 5492.51611328125,
+      "completions/min_length": 717.0,
+      "completions/min_terminated_length": 717.0,
+      "entropy": 0.600818321108818,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0010776554699987173,
+      "learning_rate": 1e-05,
+      "loss": -0.0314,
+      "num_tokens": 3798397.0,
+      "reward": 0.328125,
+      "reward_std": 0.37298911809921265,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999531507492065,
+      "sampling/importance_sampling_ratio/min": 0.0953303873538971,
+      "sampling/sampling_logp_difference/max": 2.3504066467285156,
+      "sampling/sampling_logp_difference/mean": 0.020683372393250465,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.00030824893383396557,
+      "clip_ratio/high_mean": 0.00011632417340479151,
+      "clip_ratio/low_mean": 0.0002341717704439361,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003504959422571119,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15746.0,
+      "completions/max_terminated_length": 15746.0,
+      "completions/mean_length": 4986.171875,
+      "completions/mean_terminated_length": 4986.171875,
+      "completions/min_length": 550.0,
+      "completions/min_terminated_length": 550.0,
+      "entropy": 0.40387310832738876,
+      "epoch": 0.005059797608095676,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003584277583286166,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 4127424.0,
+      "reward": 0.671875,
+      "reward_std": 0.4434390664100647,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998393654823303,
+      "sampling/importance_sampling_ratio/min": 0.02629905194044113,
+      "sampling/sampling_logp_difference/max": 3.6382224559783936,
+      "sampling/sampling_logp_difference/mean": 0.01555373053997755,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.00013135069002601085,
+      "clip_ratio/high_mean": 4.189404148746689e-05,
+      "clip_ratio/low_mean": 0.00014246321052269195,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018435725178278517,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 10079.0,
+      "completions/max_terminated_length": 10079.0,
+      "completions/mean_length": 3880.515625,
+      "completions/mean_terminated_length": 3880.515625,
+      "completions/min_length": 674.0,
+      "completions/min_terminated_length": 674.0,
+      "entropy": 0.4064784087240696,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017852422315627337,
+      "learning_rate": 1e-05,
+      "loss": 0.0198,
+      "num_tokens": 4384473.0,
+      "reward": 0.671875,
+      "reward_std": 0.2867126166820526,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999723434448242,
+      "sampling/importance_sampling_ratio/min": 0.37220701575279236,
+      "sampling/sampling_logp_difference/max": 0.9883050918579102,
+      "sampling/sampling_logp_difference/mean": 0.013887828215956688,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.00014981444019213086,
+      "clip_ratio/high_mean": 4.5794572770319064e-05,
+      "clip_ratio/low_mean": 0.00040218312869910733,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00044797768418902706,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16341.0,
+      "completions/mean_length": 8894.578125,
+      "completions/mean_terminated_length": 7669.0361328125,
+      "completions/min_length": 1085.0,
+      "completions/min_terminated_length": 1085.0,
+      "entropy": 0.5499315299093723,
+      "epoch": 0.005979760809567618,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004000168293714523,
+      "learning_rate": 1e-05,
+      "loss": 0.0373,
+      "num_tokens": 4963350.0,
+      "reward": 0.390625,
+      "reward_std": 0.2824692726135254,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999991774559021,
+      "sampling/importance_sampling_ratio/min": 0.047493718564510345,
+      "sampling/sampling_logp_difference/max": 3.0471577644348145,
+      "sampling/sampling_logp_difference/mean": 0.02204228937625885,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.00018746273144643055,
+      "clip_ratio/high_mean": 5.583179722634668e-05,
+      "clip_ratio/low_mean": 0.0001284618601857801,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001842936590037425,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12058.0,
+      "completions/max_terminated_length": 12058.0,
+      "completions/mean_length": 4584.0625,
+      "completions/mean_terminated_length": 4584.0625,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 0.4566480815410614,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003257408272475004,
+      "learning_rate": 1e-05,
+      "loss": -0.0342,
+      "num_tokens": 5266274.0,
+      "reward": 0.671875,
+      "reward_std": 0.3751009702682495,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750256538391,
+      "sampling/importance_sampling_ratio/min": 0.39602163434028625,
+      "sampling/sampling_logp_difference/max": 0.9262864589691162,
+      "sampling/sampling_logp_difference/mean": 0.01598881185054779,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.00015991039845175692,
+      "clip_ratio/high_mean": 5.3697508178629505e-05,
+      "clip_ratio/low_mean": 0.0003120610426776693,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00036575855119735934,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15009.0,
+      "completions/mean_length": 5134.671875,
+      "completions/mean_terminated_length": 4581.42578125,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "entropy": 0.41497115045785904,
+      "epoch": 0.0068997240110395585,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004677772056311369,
+      "learning_rate": 1e-05,
+      "loss": 0.05,
+      "num_tokens": 5603925.0,
+      "reward": 0.640625,
+      "reward_std": 0.3913571238517761,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001789331436157,
+      "sampling/importance_sampling_ratio/min": 0.07364130765199661,
+      "sampling/sampling_logp_difference/max": 2.608549118041992,
+      "sampling/sampling_logp_difference/mean": 0.016165096312761307,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.00025949142946046777,
+      "clip_ratio/high_mean": 9.68364292930346e-05,
+      "clip_ratio/low_mean": 0.000282365266684792,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.000379201697796816,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15173.0,
+      "completions/max_terminated_length": 15173.0,
+      "completions/mean_length": 4904.96875,
+      "completions/mean_terminated_length": 4904.96875,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.4841916747391224,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.002402309561148286,
+      "learning_rate": 1e-05,
+      "loss": 0.0633,
+      "num_tokens": 5928091.0,
+      "reward": 0.484375,
+      "reward_std": 0.41246524453163147,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999504685401917,
+      "sampling/importance_sampling_ratio/min": 0.0037722671404480934,
+      "sampling/sampling_logp_difference/max": 5.580079078674316,
+      "sampling/sampling_logp_difference/mean": 0.018390391021966934,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 6.219606439117342e-05,
+      "clip_ratio/high_mean": 1.5549016097793356e-05,
+      "clip_ratio/low_mean": 0.00019023374534299364,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002057827605312923,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13837.0,
+      "completions/mean_length": 5209.84375,
+      "completions/mean_terminated_length": 3837.578857421875,
+      "completions/min_length": 126.0,
+      "completions/min_terminated_length": 126.0,
+      "entropy": 0.3513585068285465,
+      "epoch": 0.0078196872125115,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019373978720977902,
+      "learning_rate": 1e-05,
+      "loss": 0.0016,
+      "num_tokens": 6271057.0,
+      "reward": 0.453125,
+      "reward_std": 0.3403330445289612,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999862015247345,
+      "sampling/importance_sampling_ratio/min": 0.1450539529323578,
+      "sampling/sampling_logp_difference/max": 1.9306495189666748,
+      "sampling/sampling_logp_difference/mean": 0.013681268319487572,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0001431612308806507,
+      "clip_ratio/high_mean": 4.711323526862543e-05,
+      "clip_ratio/low_mean": 9.270217788071022e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001398154154230724,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 9328.0,
+      "completions/max_terminated_length": 9328.0,
+      "completions/mean_length": 2520.640625,
+      "completions/mean_terminated_length": 2520.640625,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "entropy": 0.36302734911441803,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027223003562539816,
+      "learning_rate": 1e-05,
+      "loss": -0.0416,
+      "num_tokens": 6441562.0,
+      "reward": 0.65625,
+      "reward_std": 0.33090677857398987,
+      "rewards/accuracy_reward/mean": 0.65625,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000017762184143,
+      "sampling/importance_sampling_ratio/min": 0.3734391927719116,
+      "sampling/sampling_logp_difference/max": 0.9850001335144043,
+      "sampling/sampling_logp_difference/mean": 0.011676793918013573,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.00017718410344969016,
+      "clip_ratio/high_mean": 5.833459545101505e-05,
+      "clip_ratio/low_mean": 0.0002528423356125131,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00031117693106352817,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15108.0,
+      "completions/mean_length": 4240.96875,
+      "completions/mean_terminated_length": 4048.222412109375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.3896213509142399,
+      "epoch": 0.008739650413983441,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.002503112656995654,
+      "learning_rate": 1e-05,
+      "loss": 0.0739,
+      "num_tokens": 6721568.0,
+      "reward": 0.59375,
+      "reward_std": 0.4991811513900757,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999947547912598,
+      "sampling/importance_sampling_ratio/min": 0.10363919287919998,
+      "sampling/sampling_logp_difference/max": 2.2668397426605225,
+      "sampling/sampling_logp_difference/mean": 0.014314994215965271,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0002049997847279883,
+      "clip_ratio/high_mean": 6.95637043008901e-05,
+      "clip_ratio/low_mean": 0.00011690972041833447,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018647342039912473,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15322.0,
+      "completions/mean_length": 3738.484375,
+      "completions/mean_terminated_length": 3116.573486328125,
+      "completions/min_length": 367.0,
+      "completions/min_terminated_length": 367.0,
+      "entropy": 0.29045598581433296,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002947593806311488,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 6969399.0,
+      "reward": 0.8125,
+      "reward_std": 0.23356688022613525,
+      "rewards/accuracy_reward/mean": 0.8125,
+      "rewards/accuracy_reward/std": 0.39339789748191833,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998925924301147,
+      "sampling/importance_sampling_ratio/min": 0.11472277343273163,
+      "sampling/sampling_logp_difference/max": 2.165236711502075,
+      "sampling/sampling_logp_difference/mean": 0.011310569941997528,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.00010545731220190646,
+      "clip_ratio/high_mean": 3.014280719071394e-05,
+      "clip_ratio/low_mean": 0.00011199774735359824,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00014214055443062534,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15392.0,
+      "completions/mean_length": 6065.90625,
+      "completions/mean_terminated_length": 5191.49169921875,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.44125597178936005,
+      "epoch": 0.009659613615455382,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0011246009962633252,
+      "learning_rate": 1e-05,
+      "loss": 0.0021,
+      "num_tokens": 7365937.0,
+      "reward": 0.421875,
+      "reward_std": 0.23144522309303284,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000147819519043,
+      "sampling/importance_sampling_ratio/min": 0.25809481739997864,
+      "sampling/sampling_logp_difference/max": 1.3544282913208008,
+      "sampling/sampling_logp_difference/mean": 0.017348822206258774,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0003601935495680664,
+      "clip_ratio/high_mean": 9.941099415300414e-05,
+      "clip_ratio/low_mean": 0.00034870224044425413,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0004481132409637212,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 10951.0,
+      "completions/mean_length": 3722.015625,
+      "completions/mean_terminated_length": 3521.031982421875,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "entropy": 0.4340820461511612,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.001601650146767497,
+      "learning_rate": 1e-05,
+      "loss": 0.0015,
+      "num_tokens": 7615658.0,
+      "reward": 0.5,
+      "reward_std": 0.3913668990135193,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998630285263062,
+      "sampling/importance_sampling_ratio/min": 1.3064802715234691e-06,
+      "sampling/sampling_logp_difference/max": 13.548173904418945,
+      "sampling/sampling_logp_difference/mean": 0.016604293137788773,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0002349931419303175,
+      "clip_ratio/high_mean": 6.471897268056637e-05,
+      "clip_ratio/low_mean": 0.00014105365880823229,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00020577262966980925,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15141.0,
+      "completions/max_terminated_length": 15141.0,
+      "completions/mean_length": 3747.484375,
+      "completions/mean_terminated_length": 3747.484375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.43806017562747,
+      "epoch": 0.010579576816927323,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017510901670902967,
+      "learning_rate": 1e-05,
+      "loss": -0.0391,
+      "num_tokens": 7867545.0,
+      "reward": 0.5625,
+      "reward_std": 0.22461533546447754,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000494718551636,
+      "sampling/importance_sampling_ratio/min": 0.1432838886976242,
+      "sampling/sampling_logp_difference/max": 1.942927360534668,
+      "sampling/sampling_logp_difference/mean": 0.015971330925822258,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0002638470396050252,
+      "clip_ratio/high_mean": 8.973176045401487e-05,
+      "clip_ratio/low_mean": 0.0001654990855968208,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002552308424128569,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15089.0,
+      "completions/mean_length": 4868.609375,
+      "completions/mean_terminated_length": 4685.82568359375,
+      "completions/min_length": 1304.0,
+      "completions/min_terminated_length": 1304.0,
+      "entropy": 0.3689058944582939,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0025512739084661007,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 8187720.0,
+      "reward": 0.625,
+      "reward_std": 0.35824596881866455,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999942779541016,
+      "sampling/importance_sampling_ratio/min": 0.21243424713611603,
+      "sampling/sampling_logp_difference/max": 1.5491228103637695,
+      "sampling/sampling_logp_difference/mean": 0.01530374214053154,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.00016221465284615988,
+      "clip_ratio/high_mean": 5.93273357480939e-05,
+      "clip_ratio/low_mean": 0.0003561860394256655,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00041551337380951736,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16105.0,
+      "completions/mean_length": 7169.59375,
+      "completions/mean_terminated_length": 7023.33349609375,
+      "completions/min_length": 590.0,
+      "completions/min_terminated_length": 590.0,
+      "entropy": 0.5559867396950722,
+      "epoch": 0.011499540018399264,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0009040784207172692,
+      "learning_rate": 1e-05,
+      "loss": 0.0516,
+      "num_tokens": 8657286.0,
+      "reward": 0.328125,
+      "reward_std": 0.2414703518152237,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000077247619629,
+      "sampling/importance_sampling_ratio/min": 0.244469553232193,
+      "sampling/sampling_logp_difference/max": 1.4086644649505615,
+      "sampling/sampling_logp_difference/mean": 0.021266434341669083,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0001577084094606107,
+      "clip_ratio/high_mean": 4.298096519050887e-05,
+      "clip_ratio/low_mean": 0.00013108373877912527,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001740647035148868,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6734.921875,
+      "completions/mean_terminated_length": 6091.650390625,
+      "completions/min_length": 812.0,
+      "completions/min_terminated_length": 812.0,
+      "entropy": 0.44154683500528336,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002558791544288397,
+      "learning_rate": 1e-05,
+      "loss": 0.0372,
+      "num_tokens": 9099577.0,
+      "reward": 0.515625,
+      "reward_std": 0.2777610719203949,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999955296516418,
+      "sampling/importance_sampling_ratio/min": 0.077813521027565,
+      "sampling/sampling_logp_difference/max": 2.5534400939941406,
+      "sampling/sampling_logp_difference/mean": 0.0186590775847435,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.00014542990538757294,
+      "clip_ratio/high_mean": 3.6357476346893236e-05,
+      "clip_ratio/low_mean": 0.00021458245646499563,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00025093993099289946,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15505.0,
+      "completions/mean_length": 4848.078125,
+      "completions/mean_terminated_length": 4475.95166015625,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.4912428632378578,
+      "epoch": 0.012419503219871205,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017661909805610776,
+      "learning_rate": 1e-05,
+      "loss": 0.0957,
+      "num_tokens": 9420006.0,
+      "reward": 0.515625,
+      "reward_std": 0.3403330445289612,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000015139579773,
+      "sampling/importance_sampling_ratio/min": 0.14381231367588043,
+      "sampling/sampling_logp_difference/max": 1.9392461776733398,
+      "sampling/sampling_logp_difference/mean": 0.017206422984600067,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.00031798147210793104,
+      "clip_ratio/high_mean": 0.00010812525488290703,
+      "clip_ratio/low_mean": 0.00021282920124576776,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00032095445021695923,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15477.0,
+      "completions/mean_length": 5689.8125,
+      "completions/mean_terminated_length": 5163.86865234375,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "entropy": 0.4508574977517128,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0030540244188159704,
+      "learning_rate": 1e-05,
+      "loss": 0.0809,
+      "num_tokens": 9793746.0,
+      "reward": 0.53125,
+      "reward_std": 0.42552614212036133,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999897480010986,
+      "sampling/importance_sampling_ratio/min": 8.414050967076037e-07,
+      "sampling/sampling_logp_difference/max": 13.988192558288574,
+      "sampling/sampling_logp_difference/mean": 0.016547517850995064,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.00019940425045206212,
+      "clip_ratio/high_mean": 5.6281104662048165e-05,
+      "clip_ratio/low_mean": 0.00010776506042020628,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00016404616417275975,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14758.0,
+      "completions/max_terminated_length": 14758.0,
+      "completions/mean_length": 3069.78125,
+      "completions/mean_terminated_length": 3069.78125,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.39274851977825165,
+      "epoch": 0.013339466421343146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034625211264938116,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 10000348.0,
+      "reward": 0.546875,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000138282775879,
+      "sampling/importance_sampling_ratio/min": 0.32597410678863525,
+      "sampling/sampling_logp_difference/max": 1.1209373474121094,
+      "sampling/sampling_logp_difference/mean": 0.014218954369425774,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.00012761429206875619,
+      "clip_ratio/high_mean": 4.307139124648529e-05,
+      "clip_ratio/low_mean": 0.00010018590637628222,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00014325729807751486,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16360.0,
+      "completions/mean_length": 5308.3125,
+      "completions/mean_terminated_length": 4763.6064453125,
+      "completions/min_length": 550.0,
+      "completions/min_terminated_length": 550.0,
+      "entropy": 0.50441013276577,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.00156789505854249,
+      "learning_rate": 1e-05,
+      "loss": 0.0046,
+      "num_tokens": 10350440.0,
+      "reward": 0.515625,
+      "reward_std": 0.2109457552433014,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000964403152466,
+      "sampling/importance_sampling_ratio/min": 0.04705130681395531,
+      "sampling/sampling_logp_difference/max": 3.056516647338867,
+      "sampling/sampling_logp_difference/mean": 0.019430290907621384,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.00016632911138003692,
+      "clip_ratio/high_mean": 5.557040094572585e-05,
+      "clip_ratio/low_mean": 0.0002778837697405834,
+      "clip_ratio/low_min": 1.6620682799839415e-05,
+      "clip_ratio/region_mean": 0.00033345417978125624,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15490.0,
+      "completions/mean_length": 6388.265625,
+      "completions/mean_terminated_length": 5354.22412109375,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.5342313349246979,
+      "epoch": 0.014259429622815088,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026365246158093214,
+      "learning_rate": 1e-05,
+      "loss": 0.0118,
+      "num_tokens": 10768153.0,
+      "reward": 0.359375,
+      "reward_std": 0.31983357667922974,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998627305030823,
+      "sampling/importance_sampling_ratio/min": 0.26772308349609375,
+      "sampling/sampling_logp_difference/max": 1.31780207157135,
+      "sampling/sampling_logp_difference/mean": 0.017920637503266335,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.00017989536627283087,
+      "clip_ratio/high_mean": 5.500852148543345e-05,
+      "clip_ratio/low_mean": 0.00012964008692506468,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018464860841049813,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14732.0,
+      "completions/mean_length": 5229.078125,
+      "completions/mean_terminated_length": 4869.24169921875,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.38906631618738174,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022169759031385183,
+      "learning_rate": 1e-05,
+      "loss": 0.0213,
+      "num_tokens": 11111918.0,
+      "reward": 0.765625,
+      "reward_std": 0.3629639744758606,
+      "rewards/accuracy_reward/mean": 0.765625,
+      "rewards/accuracy_reward/std": 0.42695629596710205,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999115467071533,
+      "sampling/importance_sampling_ratio/min": 0.08817384392023087,
+      "sampling/sampling_logp_difference/max": 2.4284448623657227,
+      "sampling/sampling_logp_difference/mean": 0.015222044661641121,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.00014480652316706255,
+      "clip_ratio/high_mean": 4.443957550392952e-05,
+      "clip_ratio/low_mean": 0.00012809812687919475,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00017253770374736632,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 5148.453125,
+      "completions/mean_terminated_length": 4786.01611328125,
+      "completions/min_length": 815.0,
+      "completions/min_terminated_length": 815.0,
+      "entropy": 0.5083456933498383,
+      "epoch": 0.01517939282428703,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003128955839201808,
+      "learning_rate": 1e-05,
+      "loss": -0.0622,
+      "num_tokens": 11451323.0,
+      "reward": 0.53125,
+      "reward_std": 0.34034284949302673,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000025987625122,
+      "sampling/importance_sampling_ratio/min": 0.10359863191843033,
+      "sampling/sampling_logp_difference/max": 2.2672312259674072,
+      "sampling/sampling_logp_difference/mean": 0.017722681164741516,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 5.51352559341467e-05,
+      "clip_ratio/high_mean": 1.3783813983536675e-05,
+      "clip_ratio/low_mean": 7.914142133813584e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 9.292523554904619e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13048.0,
+      "completions/mean_length": 4609.5,
+      "completions/mean_terminated_length": 3824.533447265625,
+      "completions/min_length": 829.0,
+      "completions/min_terminated_length": 829.0,
+      "entropy": 0.49830054119229317,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0007577431970275939,
+      "learning_rate": 1e-05,
+      "loss": 0.0132,
+      "num_tokens": 11758275.0,
+      "reward": 0.375,
+      "reward_std": 0.2041158676147461,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998879432678223,
+      "sampling/importance_sampling_ratio/min": 0.05370701104402542,
+      "sampling/sampling_logp_difference/max": 2.9242117404937744,
+      "sampling/sampling_logp_difference/mean": 0.01685405895113945,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0001986039806070039,
+      "clip_ratio/high_mean": 6.727558275088086e-05,
+      "clip_ratio/low_mean": 0.0003367365798112587,
+      "clip_ratio/low_min": 6.28791003691731e-05,
+      "clip_ratio/region_mean": 0.000404012165745371,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14497.0,
+      "completions/mean_length": 4593.015625,
+      "completions/mean_terminated_length": 4013.130859375,
+      "completions/min_length": 1094.0,
+      "completions/min_terminated_length": 1094.0,
+      "entropy": 0.3128826189786196,
+      "epoch": 0.01609935602575897,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0026802816428244114,
+      "learning_rate": 1e-05,
+      "loss": 0.1212,
+      "num_tokens": 12063516.0,
+      "reward": 0.625,
+      "reward_std": 0.49234145879745483,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999848008155823,
+      "sampling/importance_sampling_ratio/min": 0.0008915311773307621,
+      "sampling/sampling_logp_difference/max": 7.0225701332092285,
+      "sampling/sampling_logp_difference/mean": 0.01317686028778553,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 7.243978234328097e-05,
+      "clip_ratio/high_mean": 1.8109945585820242e-05,
+      "clip_ratio/low_mean": 9.390242212248268e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00011201236907254497,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16161.0,
+      "completions/mean_length": 5015.171875,
+      "completions/mean_terminated_length": 4456.048828125,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 0.37973257526755333,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002345556626096368,
+      "learning_rate": 1e-05,
+      "loss": -0.0941,
+      "num_tokens": 12393103.0,
+      "reward": 0.640625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000877380371094,
+      "sampling/importance_sampling_ratio/min": 0.1842055469751358,
+      "sampling/sampling_logp_difference/max": 1.6917030811309814,
+      "sampling/sampling_logp_difference/mean": 0.0145792867988348,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.00014789494525757618,
+      "clip_ratio/high_mean": 4.601037198881386e-05,
+      "clip_ratio/low_mean": 0.0003090670288656838,
+      "clip_ratio/low_min": 1.8808304957929067e-05,
+      "clip_ratio/region_mean": 0.00035507740903995,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15632.0,
+      "completions/mean_length": 5598.484375,
+      "completions/mean_terminated_length": 5068.048828125,
+      "completions/min_length": 1283.0,
+      "completions/min_terminated_length": 1283.0,
+      "entropy": 0.35928424820303917,
+      "epoch": 0.01701931922723091,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0015618539182469249,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 12761230.0,
+      "reward": 0.546875,
+      "reward_std": 0.4240131676197052,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999818801879883,
+      "sampling/importance_sampling_ratio/min": 0.2261282205581665,
+      "sampling/sampling_logp_difference/max": 2.6031017303466797,
+      "sampling/sampling_logp_difference/mean": 0.01447785273194313,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 4.21205932070734e-05,
+      "clip_ratio/high_mean": 1.053014830176835e-05,
+      "clip_ratio/low_mean": 4.961071590514621e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.014086420691456e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16037.0,
+      "completions/mean_length": 5366.125,
+      "completions/mean_terminated_length": 4824.26220703125,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 0.41980869323015213,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0011855819029733539,
+      "learning_rate": 1e-05,
+      "loss": 0.0588,
+      "num_tokens": 13115038.0,
+      "reward": 0.5,
+      "reward_std": 0.17570312321186066,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999375343322754,
+      "sampling/importance_sampling_ratio/min": 0.15887950360774994,
+      "sampling/sampling_logp_difference/max": 1.839609146118164,
+      "sampling/sampling_logp_difference/mean": 0.015550841577351093,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0003506070097500924,
+      "clip_ratio/high_mean": 0.00010976320845657028,
+      "clip_ratio/low_mean": 0.0001256909990843269,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00023545420481241308,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15317.0,
+      "completions/max_terminated_length": 15317.0,
+      "completions/mean_length": 3308.296875,
+      "completions/mean_terminated_length": 3308.296875,
+      "completions/min_length": 786.0,
+      "completions/min_terminated_length": 786.0,
+      "entropy": 0.38983067497611046,
+      "epoch": 0.017939282428702852,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0023375866003334522,
+      "learning_rate": 1e-05,
+      "loss": 0.0624,
+      "num_tokens": 13335329.0,
+      "reward": 0.59375,
+      "reward_std": 0.3913668990135193,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99998939037323,
+      "sampling/importance_sampling_ratio/min": 0.0030945157632231712,
+      "sampling/sampling_logp_difference/max": 5.77812385559082,
+      "sampling/sampling_logp_difference/mean": 0.013900299556553364,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.000169710889167618,
+      "clip_ratio/high_mean": 5.673388113791589e-05,
+      "clip_ratio/low_mean": 0.00029868036835978273,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.000355414251316688,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15761.0,
+      "completions/mean_length": 5426.078125,
+      "completions/mean_terminated_length": 4497.44091796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.43789565935730934,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.0025193989276885986,
+      "learning_rate": 1e-05,
+      "loss": 0.0349,
+      "num_tokens": 13691110.0,
+      "reward": 0.5,
+      "reward_std": 0.45134252309799194,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011682510376,
+      "sampling/importance_sampling_ratio/min": 0.14047929644584656,
+      "sampling/sampling_logp_difference/max": 1.9626951217651367,
+      "sampling/sampling_logp_difference/mean": 0.015961986035108566,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 8.76178437465569e-05,
+      "clip_ratio/high_mean": 2.3123878236219753e-05,
+      "clip_ratio/low_mean": 0.00019285815869807266,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002159820378437871,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16374.0,
+      "completions/mean_length": 4766.140625,
+      "completions/mean_terminated_length": 4194.77001953125,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "entropy": 0.47973789647221565,
+      "epoch": 0.018859245630174794,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0005962434806860983,
+      "learning_rate": 1e-05,
+      "loss": 0.0018,
+      "num_tokens": 14006911.0,
+      "reward": 0.484375,
+      "reward_std": 0.2382849156856537,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000395774841309,
+      "sampling/importance_sampling_ratio/min": 0.12198832631111145,
+      "sampling/sampling_logp_difference/max": 2.103829860687256,
+      "sampling/sampling_logp_difference/mean": 0.016915298998355865,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 6.694088551739696e-05,
+      "clip_ratio/high_mean": 2.3428712665918283e-05,
+      "clip_ratio/low_mean": 0.0002706102432057378,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002940389586001402,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14691.0,
+      "completions/mean_length": 5922.421875,
+      "completions/mean_terminated_length": 4637.66650390625,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.42647283896803856,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001872243476100266,
+      "learning_rate": 1e-05,
+      "loss": 0.0244,
+      "num_tokens": 14394946.0,
+      "reward": 0.4375,
+      "reward_std": 0.36295416951179504,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999639987945557,
+      "sampling/importance_sampling_ratio/min": 0.293357253074646,
+      "sampling/sampling_logp_difference/max": 2.1049091815948486,
+      "sampling/sampling_logp_difference/mean": 0.01656758040189743,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.00015323197931138566,
+      "clip_ratio/high_mean": 4.9833591447168146e-05,
+      "clip_ratio/low_mean": 0.00034982425768248504,
+      "clip_ratio/low_min": 1.088660519599216e-05,
+      "clip_ratio/region_mean": 0.0003996578489022795,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16255.0,
+      "completions/mean_length": 6493.1875,
+      "completions/mean_terminated_length": 6006.75390625,
+      "completions/min_length": 479.0,
+      "completions/min_terminated_length": 479.0,
+      "entropy": 0.4782983772456646,
+      "epoch": 0.019779208831646734,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.00166318379342556,
+      "learning_rate": 1e-05,
+      "loss": 0.0511,
+      "num_tokens": 14821182.0,
+      "reward": 0.46875,
+      "reward_std": 0.4092700183391571,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998992085456848,
+      "sampling/importance_sampling_ratio/min": 1.7716387219479657e-06,
+      "sampling/sampling_logp_difference/max": 13.243605613708496,
+      "sampling/sampling_logp_difference/mean": 0.018610000610351562,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 6.034070747773512e-05,
+      "clip_ratio/high_mean": 1.6863068026395922e-05,
+      "clip_ratio/low_mean": 9.460987712373026e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00011147294480906567,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16269.0,
+      "completions/mean_length": 4648.546875,
+      "completions/mean_terminated_length": 4269.98388671875,
+      "completions/min_length": 665.0,
+      "completions/min_terminated_length": 665.0,
+      "entropy": 0.4597437307238579,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0008557081455364823,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 15128561.0,
+      "reward": 0.328125,
+      "reward_std": 0.23144522309303284,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4732423722743988,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999915957450867,
+      "sampling/importance_sampling_ratio/min": 0.2670474946498871,
+      "sampling/sampling_logp_difference/max": 1.320328712463379,
+      "sampling/sampling_logp_difference/mean": 0.016183078289031982,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.00016895902081159875,
+      "clip_ratio/high_mean": 6.0399999711080454e-05,
+      "clip_ratio/low_mean": 0.0002296717866556719,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00029007178636675235,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 6930.234375,
+      "completions/mean_terminated_length": 6129.06787109375,
+      "completions/min_length": 682.0,
+      "completions/min_terminated_length": 682.0,
+      "entropy": 0.5115556567907333,
+      "epoch": 0.020699172033118676,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0016648141900077462,
+      "learning_rate": 1e-05,
+      "loss": 0.0232,
+      "num_tokens": 15582168.0,
+      "reward": 0.5625,
+      "reward_std": 0.3424547016620636,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.5,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000483989715576,
+      "sampling/importance_sampling_ratio/min": 0.187262162566185,
+      "sampling/sampling_logp_difference/max": 1.937586784362793,
+      "sampling/sampling_logp_difference/mean": 0.019788919016718864,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 9.100124134420184e-05,
+      "clip_ratio/high_mean": 3.351398640916159e-05,
+      "clip_ratio/low_mean": 0.000253890422754921,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002874044093914563,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16321.0,
+      "completions/mean_length": 6264.671875,
+      "completions/mean_terminated_length": 5938.24169921875,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "entropy": 0.43167873099446297,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0013617790536955,
+      "learning_rate": 1e-05,
+      "loss": 0.0032,
+      "num_tokens": 15994715.0,
+      "reward": 0.640625,
+      "reward_std": 0.3766237497329712,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.4836103618144989,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999179840087891,
+      "sampling/importance_sampling_ratio/min": 0.1620832085609436,
+      "sampling/sampling_logp_difference/max": 1.8196454048156738,
+      "sampling/sampling_logp_difference/mean": 0.017889156937599182,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 6.15748222116963e-05,
+      "clip_ratio/high_mean": 1.870576988949324e-05,
+      "clip_ratio/low_mean": 0.0003191337254975224,
+      "clip_ratio/low_min": 4.877414176007733e-05,
+      "clip_ratio/region_mean": 0.0003378394994797418,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12838.0,
+      "completions/mean_length": 4168.140625,
+      "completions/mean_terminated_length": 3974.23828125,
+      "completions/min_length": 705.0,
+      "completions/min_terminated_length": 705.0,
+      "entropy": 0.433504331856966,
+      "epoch": 0.021619135234590615,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003133355872705579,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 16272044.0,
+      "reward": 0.34375,
+      "reward_std": 0.3377465009689331,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998913407325745,
+      "sampling/importance_sampling_ratio/min": 0.38697248697280884,
+      "sampling/sampling_logp_difference/max": 1.4266910552978516,
+      "sampling/sampling_logp_difference/mean": 0.014272443950176239,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 5.0198698772874195e-05,
+      "clip_ratio/high_mean": 1.2549674693218549e-05,
+      "clip_ratio/low_mean": 0.00024944932374637574,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002619989991217153,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5228.15625,
+      "completions/mean_terminated_length": 4868.2900390625,
+      "completions/min_length": 1099.0,
+      "completions/min_terminated_length": 1099.0,
+      "entropy": 0.6134471148252487,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002945883432403207,
+      "learning_rate": 1e-05,
+      "loss": 0.0237,
+      "num_tokens": 16616510.0,
+      "reward": 0.453125,
+      "reward_std": 0.39560043811798096,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000022053718567,
+      "sampling/importance_sampling_ratio/min": 0.23671367764472961,
+      "sampling/sampling_logp_difference/max": 1.4409040212631226,
+      "sampling/sampling_logp_difference/mean": 0.01892893575131893,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.00010992094757966697,
+      "clip_ratio/high_mean": 3.773104890569812e-05,
+      "clip_ratio/low_mean": 0.0002085948569856555,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002463259042997379,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15580.0,
+      "completions/max_terminated_length": 15580.0,
+      "completions/mean_length": 4286.90625,
+      "completions/mean_terminated_length": 4286.90625,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "entropy": 0.3194341119378805,
+      "epoch": 0.022539098436062558,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0033129912335425615,
+      "learning_rate": 1e-05,
+      "loss": -0.0135,
+      "num_tokens": 16903128.0,
+      "reward": 0.578125,
+      "reward_std": 0.4113916754722595,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000847578048706,
+      "sampling/importance_sampling_ratio/min": 0.14042755961418152,
+      "sampling/sampling_logp_difference/max": 1.9630634784698486,
+      "sampling/sampling_logp_difference/mean": 0.0129241943359375,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.00010812897107825847,
+      "clip_ratio/high_mean": 3.162783127663715e-05,
+      "clip_ratio/low_mean": 0.0001828691292757867,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00021449696214403957,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15964.0,
+      "completions/mean_length": 5032.125,
+      "completions/mean_terminated_length": 4070.101806640625,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "entropy": 0.4777919165790081,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021068111527711153,
+      "learning_rate": 1e-05,
+      "loss": -0.0866,
+      "num_tokens": 17236504.0,
+      "reward": 0.515625,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5037065148353577,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000327825546265,
+      "sampling/importance_sampling_ratio/min": 0.2832590341567993,
+      "sampling/sampling_logp_difference/max": 1.8220746517181396,
+      "sampling/sampling_logp_difference/mean": 0.01738543063402176,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.00012820017036574427,
+      "clip_ratio/high_mean": 3.647331323008984e-05,
+      "clip_ratio/low_mean": 0.00025561100665072445,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002920843198808143,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13201.0,
+      "completions/mean_length": 4803.203125,
+      "completions/mean_terminated_length": 4619.38134765625,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "entropy": 0.4494751952588558,
+      "epoch": 0.023459061637534497,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028032760601490736,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 17553269.0,
+      "reward": 0.609375,
+      "reward_std": 0.3403330445289612,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999998152256012,
+      "sampling/importance_sampling_ratio/min": 0.21100811660289764,
+      "sampling/sampling_logp_difference/max": 1.5558586120605469,
+      "sampling/sampling_logp_difference/mean": 0.01737060397863388,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.00010267168681821204,
+      "clip_ratio/high_mean": 3.3487939049337e-05,
+      "clip_ratio/low_mean": 0.00015384274320240365,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00018733068225174065,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 7100.3125,
+      "completions/mean_terminated_length": 6643.7373046875,
+      "completions/min_length": 1183.0,
+      "completions/min_terminated_length": 1183.0,
+      "entropy": 0.5009776279330254,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001591994776390493,
+      "learning_rate": 1e-05,
+      "loss": -0.0421,
+      "num_tokens": 18016729.0,
+      "reward": 0.453125,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.501733124256134,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000343322753906,
+      "sampling/importance_sampling_ratio/min": 0.09941783547401428,
+      "sampling/sampling_logp_difference/max": 2.3084237575531006,
+      "sampling/sampling_logp_difference/mean": 0.01882891170680523,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.00016665930297676823,
+      "clip_ratio/high_mean": 5.2525359819810546e-05,
+      "clip_ratio/low_mean": 0.0004211304803902749,
+      "clip_ratio/low_min": 9.529018279863521e-05,
+      "clip_ratio/region_mean": 0.0004736558298645832,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14227.0,
+      "completions/mean_length": 6233.796875,
+      "completions/mean_terminated_length": 5557.1171875,
+      "completions/min_length": 1338.0,
+      "completions/min_terminated_length": 1338.0,
+      "entropy": 0.48881014063954353,
+      "epoch": 0.02437902483900644,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003694011364132166,
+      "learning_rate": 1e-05,
+      "loss": 0.1627,
+      "num_tokens": 18426140.0,
+      "reward": 0.625,
+      "reward_std": 0.3977220952510834,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000104904174805,
+      "sampling/importance_sampling_ratio/min": 0.20072485506534576,
+      "sampling/sampling_logp_difference/max": 1.6058201789855957,
+      "sampling/sampling_logp_difference/mean": 0.01879170536994934,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.00012100895446565119,
+      "clip_ratio/high_mean": 4.9377299660591234e-05,
+      "clip_ratio/low_mean": 0.00019421957949816715,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00024359687631658744,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15854.0,
+      "completions/mean_length": 5629.03125,
+      "completions/mean_terminated_length": 5282.0966796875,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.3631018362939358,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.001484633656218648,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 18794958.0,
+      "reward": 0.609375,
+      "reward_std": 0.4050365090370178,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4917473793029785,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000042200088501,
+      "sampling/importance_sampling_ratio/min": 0.002677773591130972,
+      "sampling/sampling_logp_difference/max": 5.922769546508789,
+      "sampling/sampling_logp_difference/mean": 0.013976464979350567,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.00021361040307965595,
+      "clip_ratio/high_mean": 8.756921079111635e-05,
+      "clip_ratio/low_mean": 0.0002042179089585261,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00029178711429267423,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 5366.453125,
+      "completions/mean_terminated_length": 5191.57177734375,
+      "completions/min_length": 593.0,
+      "completions/min_terminated_length": 593.0,
+      "entropy": 0.34573371335864067,
+      "epoch": 0.025298988040478382,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0018017840338870883,
+      "learning_rate": 1e-05,
+      "loss": -0.0307,
+      "num_tokens": 19148275.0,
+      "reward": 0.734375,
+      "reward_std": 0.4050365090370178,
+      "rewards/accuracy_reward/mean": 0.734375,
+      "rewards/accuracy_reward/std": 0.44515693187713623,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999160766601562,
+      "sampling/importance_sampling_ratio/min": 0.22769968211650848,
+      "sampling/sampling_logp_difference/max": 1.4797277450561523,
+      "sampling/sampling_logp_difference/mean": 0.014456957578659058,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.00020042336745973444,
+      "clip_ratio/high_mean": 5.850923639627581e-05,
+      "clip_ratio/low_mean": 0.00019344742031535134,
+      "clip_ratio/low_min": 1.594387686054688e-05,
+      "clip_ratio/region_mean": 0.0002519566587579902,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15942.0,
+      "completions/mean_length": 5801.921875,
+      "completions/mean_terminated_length": 5460.564453125,
+      "completions/min_length": 538.0,
+      "completions/min_terminated_length": 538.0,
+      "entropy": 0.4420101195573807,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0007390208193100989,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 19530718.0,
+      "reward": 0.421875,
+      "reward_std": 0.2993341088294983,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999010562896729,
+      "sampling/importance_sampling_ratio/min": 0.04691341519355774,
+      "sampling/sampling_logp_difference/max": 3.0594515800476074,
+      "sampling/sampling_logp_difference/mean": 0.016371876001358032,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0001929260479300865,
+      "clip_ratio/high_mean": 7.267188334481034e-05,
+      "clip_ratio/low_mean": 0.00013643273086927366,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00020910461648782075,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15886.0,
+      "completions/max_terminated_length": 15886.0,
+      "completions/mean_length": 3581.09375,
+      "completions/mean_terminated_length": 3581.09375,
+      "completions/min_length": 615.0,
+      "completions/min_terminated_length": 615.0,
+      "entropy": 0.36750902235507965,
+      "epoch": 0.02621895124195032,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020201546140015125,
+      "learning_rate": 1e-05,
+      "loss": 0.1245,
+      "num_tokens": 19771076.0,
+      "reward": 0.578125,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000082015991211,
+      "sampling/importance_sampling_ratio/min": 0.21508392691612244,
+      "sampling/sampling_logp_difference/max": 2.204270362854004,
+      "sampling/sampling_logp_difference/mean": 0.013558689504861832,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.00019395453546167118,
+      "clip_ratio/high_mean": 6.426821187233145e-05,
+      "clip_ratio/low_mean": 0.00017469121939939214,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00023895943377283402,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14150.0,
+      "completions/max_terminated_length": 14150.0,
+      "completions/mean_length": 4180.46875,
+      "completions/mean_terminated_length": 4180.46875,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.4649594761431217,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0028552189469337463,
+      "learning_rate": 1e-05,
+      "loss": 0.0924,
+      "num_tokens": 20048138.0,
+      "reward": 0.53125,
+      "reward_std": 0.4276576042175293,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5029674172401428,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000083446502686,
+      "sampling/importance_sampling_ratio/min": 0.2393883913755417,
+      "sampling/sampling_logp_difference/max": 1.4296680688858032,
+      "sampling/sampling_logp_difference/mean": 0.017490293830633163,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.00014915554584149504,
+      "clip_ratio/high_mean": 3.9898490058476455e-05,
+      "clip_ratio/low_mean": 5.383538700698409e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 9.373387524647114e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15323.0,
+      "completions/max_terminated_length": 15323.0,
+      "completions/mean_length": 4642.15625,
+      "completions/mean_terminated_length": 4642.15625,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "entropy": 0.41386983543634415,
+      "epoch": 0.027138914443422264,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014837872004136443,
+      "learning_rate": 1e-05,
+      "loss": -0.0232,
+      "num_tokens": 20355020.0,
+      "reward": 0.65625,
+      "reward_std": 0.3198433816432953,
+      "rewards/accuracy_reward/mean": 0.65625,
+      "rewards/accuracy_reward/std": 0.4787135720252991,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001411437988281,
+      "sampling/importance_sampling_ratio/min": 0.022514859214425087,
+      "sampling/sampling_logp_difference/max": 3.7935798168182373,
+      "sampling/sampling_logp_difference/mean": 0.015344480983912945,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 7.379077214864083e-05,
+      "clip_ratio/high_mean": 2.223373576271115e-05,
+      "clip_ratio/low_mean": 0.00013174474815969006,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0001539784839224012,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15170.0,
+      "completions/max_terminated_length": 15170.0,
+      "completions/mean_length": 3369.015625,
+      "completions/mean_terminated_length": 3369.015625,
+      "completions/min_length": 267.0,
+      "completions/min_terminated_length": 267.0,
+      "entropy": 0.46293293312191963,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023857210762798786,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 20579309.0,
+      "reward": 0.40625,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999541640281677,
+      "sampling/importance_sampling_ratio/min": 0.00012647465337067842,
+      "sampling/sampling_logp_difference/max": 8.975468635559082,
+      "sampling/sampling_logp_difference/mean": 0.016323832795023918,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.00010131701310456265,
+      "clip_ratio/high_mean": 3.068578371312469e-05,
+      "clip_ratio/low_mean": 0.00017564234258315992,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0002063281253867899,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15267.0,
+      "completions/mean_length": 4186.265625,
+      "completions/mean_terminated_length": 3992.651123046875,
+      "completions/min_length": 636.0,
+      "completions/min_terminated_length": 636.0,
+      "entropy": 0.4424850195646286,
+      "epoch": 0.028058877644894203,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.001888959901407361,
+      "learning_rate": 1e-05,
+      "loss": -0.0867,
+      "num_tokens": 20858230.0,
+      "reward": 0.5,
+      "reward_std": 0.43401283025741577,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5039526224136353,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001115798950195,
+      "sampling/importance_sampling_ratio/min": 0.21523967385292053,
+      "sampling/sampling_logp_difference/max": 1.5360031127929688,
+      "sampling/sampling_logp_difference/mean": 0.015638090670108795,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.00018883940902014729,
+      "clip_ratio/high_mean": 6.83412895341462e-05,
+      "clip_ratio/low_mean": 0.00029582804199890234,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0003641693292593118,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15784.0,
+      "completions/mean_length": 8232.328125,
+      "completions/mean_terminated_length": 7231.24560546875,
+      "completions/min_length": 532.0,
+      "completions/min_terminated_length": 532.0,
+      "entropy": 0.4720785431563854,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0010464832885190845,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 21394763.0,
+      "reward": 0.421875,
+      "reward_std": 0.30617380142211914,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.49776285886764526,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999436736106873,
+      "sampling/importance_sampling_ratio/min": 0.05187493562698364,
+      "sampling/sampling_logp_difference/max": 2.9589195251464844,
+      "sampling/sampling_logp_difference/mean": 0.019340507686138153,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 7.807558085914934e-05,
+      "clip_ratio/high_mean": 2.2267657527663687e-05,
+      "clip_ratio/low_mean": 0.0001811299157452595,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.00020339757793408353,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15783.0,
+      "completions/mean_length": 6065.875,
+      "completions/mean_terminated_length": 5558.42578125,
+      "completions/min_length": 763.0,
+      "completions/min_terminated_length": 763.0,
+      "entropy": 0.5249982811510563,
+      "epoch": 0.028978840846366146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016154105542227626,
+      "learning_rate": 1e-05,
+      "loss": 0.1536,
+      "num_tokens": 21793091.0,
+      "reward": 0.40625,
+      "reward_std": 0.2756394147872925,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.49501484632492065,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998780488967896,
+      "sampling/importance_sampling_ratio/min": 0.05374135076999664,
+      "sampling/sampling_logp_difference/max": 2.923572540283203,
+      "sampling/sampling_logp_difference/mean": 0.017961012199521065,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 3.358934282005066e-05,
+      "clip_ratio/high_mean": 8.397335705012665e-06,
+      "clip_ratio/low_mean": 3.994480266555911e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.834213746107707e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5830.015625,
+      "completions/mean_terminated_length": 5489.564453125,
+      "completions/min_length": 487.0,
+      "completions/min_terminated_length": 487.0,
+      "entropy": 0.49247242510318756,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0013925280654802918,
+      "learning_rate": 1e-05,
+      "loss": 0.0145,
+      "num_tokens": 22176908.0,
+      "reward": 0.375,
+      "reward_std": 0.1872510462999344,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.48795005679130554,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000190734863281,
+      "sampling/importance_sampling_ratio/min": 0.00015296634228434414,
+      "sampling/sampling_logp_difference/max": 8.785292625427246,
+      "sampling/sampling_logp_difference/mean": 0.016575772315263748,
+      "step": 64
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 22176908,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/zero_to_fp32.py b/dapo_lora_7b_20251202_002719/checkpoint-64/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_lora_7b_20251202_002719/checkpoint-64/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Model-state metadata read from one ``*_model_states.pt`` file.
+
+    Instances are built by ``parse_model_states``. ``ds_version``, ``frozen_param_shapes`` and
+    ``frozen_param_fragments`` are ``None`` when the checkpoint has no corresponding entry.
+    """
+    # The annotations name the types themselves; the earlier ``dict()`` spelling evaluated to an
+    # empty dict instance, which is not a usable type hint.
+    buffers: dict  # buffer name -> value converted with ``.float()``
+    param_shapes: dict  # NOTE(review): parse_model_states iterates this as a sequence of dicts - confirm
+    shared_params: list  # [key, value] pairs taken from the checkpoint's "shared_params" entry
+    ds_version: int  # NOTE(review): stored as read from the checkpoint, may not be an int - confirm
+    frozen_param_shapes: dict
+    frozen_param_fragments: dict
+
+
+# Set to a truthy value to make the helpers in this script print extra diagnostics.
+debug = 0
+
+# load to cpu: checkpoint files are read with ``map_location=device``
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Convert ``text`` to ``int`` when it consists solely of decimal digits.
+
+    Args:
+        text: a string chunk, e.g. one piece of the split performed by ``natural_keys``.
+
+    Returns:
+        ``int(text)`` if every character is a decimal digit, otherwise ``text`` unchanged
+        (the empty string is returned as-is).
+    """
+    # ``isdecimal`` rather than ``isdigit``: ``isdigit`` is also true for characters such as
+    # superscript digits that ``int()`` rejects, which made this function raise ValueError.
+    return int(text) if text.isdecimal() else text
+
+
+def natural_keys(text):
+    '''
+    Sort key giving "human" (natural) order, e.g. "rank_2" sorts before "rank_10".
+    Usage: alist.sort(key=natural_keys)
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    # The capturing group keeps the digit runs in the split result; atoi turns those runs into ints.
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """Return the path of the model-states file expected inside ``checkpoint_dir``.
+
+    Args:
+        checkpoint_dir: directory that should contain the model-states file.
+        zero_stage: ZeRO stage of the checkpoint; stages up to 2 and stage 3 use different file names.
+
+    Returns:
+        Path of the existing model-states file.
+
+    Raises:
+        FileNotFoundError: if ``checkpoint_dir`` is not a directory or the expected file is missing.
+        ValueError: if ``zero_stage`` is neither ``<= 2`` nor ``== 3``.
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # Without this branch ``file`` stays unbound and the check below fails with UnboundLocalError.
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """Return the files in ``checkpoint_dir`` that match ``glob_pattern``, sorted with ``natural_keys``.
+
+    Raises:
+        FileNotFoundError: if no file matches the pattern.
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    matches.sort(key=natural_keys)
+
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return matches
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    optim_glob = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_glob)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    model_glob = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_glob)
+
+
+def parse_model_states(files):
+    """Load each model-states file and wrap the pieces this script needs in a ``zero_model_state``.
+
+    Args:
+        files: paths of model-states checkpoint files; each is read with ``torch.load`` onto ``device``.
+
+    Returns:
+        List with one ``zero_model_state`` per entry of ``files``, in the same order.
+
+    Raises:
+        ValueError: if a loaded file has no ``BUFFER_NAMES`` entry.
+    """
+    parsed = []
+    for path in files:
+        ckpt = torch.load(path, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in ckpt:
+            raise ValueError(f"{path} is not a model state checkpoint")
+        buffer_names = ckpt[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # keep only the buffers, restoring them to fp32 if they were saved in fp16
+        buffers = {}
+        for key, value in ckpt["module"].items():
+            if key in buffer_names:
+                buffers[key] = value.float()
+        shapes = ckpt[PARAM_SHAPES]
+
+        # names of the parameters listed in param_shapes (gathered here, not used further in this function)
+        param_names = [name for group in shapes for name in group.keys()]
+
+        # frozen parameters, when the checkpoint has them
+        frozen_shapes = ckpt.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_shapes}")
+            param_names.extend(frozen_shapes.keys())
+
+        # shared params are kept as [key, value] pairs
+        shared = [[key, value] for key, value in ckpt["shared_params"].items()]
+
+        parsed.append(
+            zero_model_state(buffers=buffers,
+                             param_shapes=shapes,
+                             shared_params=shared,
+                             ds_version=ckpt.get(DS_VERSION, None),
+                             frozen_param_shapes=frozen_shapes,
+                             frozen_param_fragments=ckpt.get(FROZEN_PARAM_FRAGMENTS, None)))
+
+    return parsed
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """Load every optimizer-state shard and pull out its fp32 groups.
+
+    Args:
+        files: paths of the optimizer-state shards; the first one supplies the ZeRO stage and partition count.
+        ds_checkpoint_dir: checkpoint directory, used only in the shard-count error message.
+
+    Returns:
+        Tuple ``(zero_stage, world_size, fp32_flat_groups)``; ``fp32_flat_groups`` holds one entry per
+        shard, in the order of ``files``.
+
+    Raises:
+        ValueError: if the first shard has no ``ZERO_STAGE`` entry, if the number of shards differs from
+            the partition count, or if the stage is neither ``<= 2`` nor ``== 3``.
+    """
+    shard_count = len(files)
+    shards = []
+    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
+        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge nested optimizer state as we only care for fp32 master
+        # weights; the default of None also covers the case where another helper script already removed it
+        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        shards.append(shard)
+
+    first_optim = shards[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim[ZERO_STAGE]
+    world_size = first_optim[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != shard_count:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {shard_count} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][groups_key] for shard in shards]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: forwarded unchanged to the stage-specific reconstruction function
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    # optimizer shards give the ZeRO stage, the world size and the fp32 groups
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    # model-state files give the buffers, shapes and frozen-parameter data
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    # pick the reconstruction routine for the detected stage
+    if zero_stage <= 2:
+        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+
+    return rebuild(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol: for every param group, concatenate the per-rank partitions into one
+    # flat fp32 vector, carve each param out of it in ``param_shapes`` order, then check that the
+    # consumed length equals the available length once both are rounded up to ``2 * world_size``.
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0  # position inside this group's merged vector
+        avail_numel = full_single_fp32_vector.numel()  # from here on: per-group, not the grand total
+        for name, shape in shapes.items():
+            # shapes exposing a callable ``numel`` use it; anything else is treated as a sequence of dims
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)  # round x up to a multiple of align_to
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check: after alignment the consumed and the available lengths must be identical
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` on it
+
+    Returns:
+        - pytorch ``state_dict`` (returned as reconstructed, without conversion, when ``lazy_mode`` is set)
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()  # the 'latest' file content is used as the tag
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)  # the tag names a sub-folder of checkpoint_dir
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) (one file, or several shards
+    plus an index file) that can be loaded with ``torch.load(file)`` + ``load_state_dict()`` without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the shards on empty tensors of the same shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)  # only this shard is materialized
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    The weights are loaded with ``strict=False``.
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/README.md b/dapo_lora_plus_20251202_001141/checkpoint-128/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-128/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-128/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-128/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-128/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-128/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/latest b/dapo_lora_plus_20251202_001141/checkpoint-128/latest
new file mode 100644
index 0000000000000000000000000000000000000000..b4db7fb020d9ef75e52048bf0cde7481e3ef9351
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-128/latest
@@ -0,0 +1 @@
+global_step128
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-128/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-128/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/tokenizer_config.json b/dapo_lora_plus_20251202_001141/checkpoint-128/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-128/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-128/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7554bec987cbd24bb2cef715f0fe73e0a1ecbcbd
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-128/trainer_state.json
@@ -0,0 +1,4002 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.11775528978840846,
+  "eval_steps": 500,
+  "global_step": 128,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025745572056621313,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 5.499582130141789e-06,
+      "clip_ratio/high_mean": 1.3748955325354473e-06,
+      "clip_ratio/low_mean": 2.871888784738985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009378326623846e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 4767.1875,
+      "completions/mean_terminated_length": 4767.1875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.088237851858139,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002068034838885069,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 1425798.0,
+      "reward": 0.3046875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 0.01811397261917591,
+      "sampling/sampling_logp_difference/max": 4.011071681976318,
+      "sampling/sampling_logp_difference/mean": 0.01877593621611595,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.459846724103045e-05,
+      "clip_ratio/low_min": 3.4060874440910993e-06,
+      "clip_ratio/region_mean": 4.459846724103045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 6586.359375,
+      "completions/mean_terminated_length": 6351.21630859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0497623533010483,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001971944235265255,
+      "learning_rate": 1e-05,
+      "loss": 0.0199,
+      "num_tokens": 2287420.0,
+      "reward": 0.28125,
+      "reward_std": 0.29143062233924866,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999316334724426,
+      "sampling/importance_sampling_ratio/min": 5.356698966352269e-05,
+      "sampling/sampling_logp_difference/max": 9.834577560424805,
+      "sampling/sampling_logp_difference/mean": 0.02137824520468712,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.7640652004047297e-05,
+      "clip_ratio/high_mean": 5.48578327652649e-06,
+      "clip_ratio/low_mean": 3.218628648937738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.767206976590387e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14690.0,
+      "completions/max_terminated_length": 14690.0,
+      "completions/mean_length": 5448.0234375,
+      "completions/mean_terminated_length": 5448.0234375,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 1.1134418621659279,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016465173102915287,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 3009167.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27958330512046814,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 7.889385415182915e-06,
+      "sampling/sampling_logp_difference/max": 11.749992370605469,
+      "sampling/sampling_logp_difference/mean": 0.020580951124429703,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 1.3439519989333348e-05,
+      "clip_ratio/high_mean": 3.359879997333337e-06,
+      "clip_ratio/low_mean": 2.8849915906903334e-05,
+      "clip_ratio/low_min": 8.467687621305231e-06,
+      "clip_ratio/region_mean": 3.220979442630778e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13420.0,
+      "completions/mean_length": 5436.8671875,
+      "completions/mean_terminated_length": 5350.66943359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 1.1473859176039696,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023770295083522797,
+      "learning_rate": 1e-05,
+      "loss": 0.0153,
+      "num_tokens": 3725654.0,
+      "reward": 0.2734375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99991774559021,
+      "sampling/importance_sampling_ratio/min": 0.0011146117467433214,
+      "sampling/sampling_logp_difference/max": 6.799249172210693,
+      "sampling/sampling_logp_difference/mean": 0.020377254113554955,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 4.652201369026443e-06,
+      "clip_ratio/high_mean": 1.1630503422566107e-06,
+      "clip_ratio/low_mean": 2.8399212624208303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9562263534899103e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14440.0,
+      "completions/max_terminated_length": 14440.0,
+      "completions/mean_length": 4697.5390625,
+      "completions/mean_terminated_length": 4697.5390625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.0097229778766632,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003342699259519577,
+      "learning_rate": 1e-05,
+      "loss": 0.0326,
+      "num_tokens": 4345547.0,
+      "reward": 0.390625,
+      "reward_std": 0.34480881690979004,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914765357971,
+      "sampling/importance_sampling_ratio/min": 0.002385853324085474,
+      "sampling/sampling_logp_difference/max": 6.038198471069336,
+      "sampling/sampling_logp_difference/mean": 0.0185473021119833,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 9.362594937556423e-06,
+      "clip_ratio/high_mean": 2.340648734389106e-06,
+      "clip_ratio/low_mean": 6.054362825125281e-05,
+      "clip_ratio/low_min": 7.427356649714056e-06,
+      "clip_ratio/region_mean": 6.288427744038927e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14652.0,
+      "completions/mean_length": 6218.2109375,
+      "completions/mean_terminated_length": 5890.2822265625,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.0579778030514717,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002073560608550906,
+      "learning_rate": 1e-05,
+      "loss": 0.0201,
+      "num_tokens": 5160646.0,
+      "reward": 0.2109375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 0.00044544730917550623,
+      "sampling/sampling_logp_difference/max": 7.716431617736816,
+      "sampling/sampling_logp_difference/mean": 0.020321575924754143,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 1.1064067621191498e-05,
+      "clip_ratio/high_mean": 2.7660169052978745e-06,
+      "clip_ratio/low_mean": 2.2175867059104348e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4941883737028547e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13637.0,
+      "completions/mean_length": 5127.8359375,
+      "completions/mean_terminated_length": 5039.20458984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.0472618415951729,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032994600478559732,
+      "learning_rate": 1e-05,
+      "loss": 0.0751,
+      "num_tokens": 5836289.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999483227729797,
+      "sampling/importance_sampling_ratio/min": 0.0013780994340777397,
+      "sampling/sampling_logp_difference/max": 6.587049961090088,
+      "sampling/sampling_logp_difference/mean": 0.01940803974866867,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 1.2357884770608507e-05,
+      "clip_ratio/high_mean": 3.0894711926521268e-06,
+      "clip_ratio/low_mean": 3.000627111759968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.309574231025181e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15916.0,
+      "completions/mean_length": 4516.890625,
+      "completions/mean_terminated_length": 4423.44873046875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "entropy": 0.911251038312912,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003016560571268201,
+      "learning_rate": 1e-05,
+      "loss": 0.1006,
+      "num_tokens": 6433171.0,
+      "reward": 0.390625,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999179840087891,
+      "sampling/importance_sampling_ratio/min": 0.005480794236063957,
+      "sampling/sampling_logp_difference/max": 5.206505298614502,
+      "sampling/sampling_logp_difference/mean": 0.017437148839235306,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 4.6329013457580004e-05,
+      "clip_ratio/high_mean": 1.1582253364395001e-05,
+      "clip_ratio/low_mean": 7.069455705277505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.227681109929108e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13970.0,
+      "completions/mean_length": 4961.453125,
+      "completions/mean_terminated_length": 4687.31201171875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "entropy": 0.6808596402406693,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0035386616364121437,
+      "learning_rate": 1e-05,
+      "loss": 0.0596,
+      "num_tokens": 7085389.0,
+      "reward": 0.5625,
+      "reward_std": 0.3816363215446472,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0002734088629949838,
+      "sampling/sampling_logp_difference/max": 8.20454216003418,
+      "sampling/sampling_logp_difference/mean": 0.01566406339406967,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 2.43190661421977e-05,
+      "clip_ratio/high_mean": 6.079766535549425e-06,
+      "clip_ratio/low_mean": 2.2395396172214532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8475162707763957e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14776.0,
+      "completions/mean_length": 4429.40625,
+      "completions/mean_terminated_length": 4335.275390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9181502386927605,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022535293828696012,
+      "learning_rate": 1e-05,
+      "loss": 0.0031,
+      "num_tokens": 7672185.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20357418060302734,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998801946640015,
+      "sampling/importance_sampling_ratio/min": 5.315856554943821e-08,
+      "sampling/sampling_logp_difference/max": 16.74998664855957,
+      "sampling/sampling_logp_difference/mean": 0.018429335206747055,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 1.0117325928149512e-05,
+      "clip_ratio/high_mean": 2.529331482037378e-06,
+      "clip_ratio/low_mean": 1.1982813475697185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.45121450714214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5282.6796875,
+      "completions/mean_terminated_length": 5106.46875,
+      "completions/min_length": 323.0,
+      "completions/min_terminated_length": 323.0,
+      "entropy": 1.113751620054245,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013591813622042537,
+      "learning_rate": 1e-05,
+      "loss": 0.0971,
+      "num_tokens": 8369000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3029736578464508,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 3.970265970565379e-05,
+      "sampling/sampling_logp_difference/max": 10.134092330932617,
+      "sampling/sampling_logp_difference/mean": 0.020221836864948273,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 5.411958227341529e-06,
+      "clip_ratio/high_mean": 1.3529895568353822e-06,
+      "clip_ratio/low_mean": 2.5284593846208736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6637583516730956e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15925.0,
+      "completions/mean_length": 6970.421875,
+      "completions/mean_terminated_length": 6744.49609375,
+      "completions/min_length": 53.0,
+      "completions/min_terminated_length": 53.0,
+      "entropy": 1.1721933633089066,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024079051800072193,
+      "learning_rate": 1e-05,
+      "loss": 0.0713,
+      "num_tokens": 9283182.0,
+      "reward": 0.171875,
+      "reward_std": 0.17965975403785706,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999163746833801,
+      "sampling/importance_sampling_ratio/min": 0.0008915197686292231,
+      "sampling/sampling_logp_difference/max": 7.0225830078125,
+      "sampling/sampling_logp_difference/mean": 0.021462474018335342,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 2.0661535927501973e-05,
+      "clip_ratio/high_mean": 5.165383981875493e-06,
+      "clip_ratio/low_mean": 2.4304956298237812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.947033948430544e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14658.0,
+      "completions/max_terminated_length": 14658.0,
+      "completions/mean_length": 4886.875,
+      "completions/mean_terminated_length": 4886.875,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 1.0108910650014877,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002063734456896782,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 9928446.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2409384697675705,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000026226043701,
+      "sampling/importance_sampling_ratio/min": 0.0003672837920021266,
+      "sampling/sampling_logp_difference/max": 7.9093756675720215,
+      "sampling/sampling_logp_difference/mean": 0.01918785460293293,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.4761846993424115e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4761846993424115e-06,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12992.0,
+      "completions/max_terminated_length": 12992.0,
+      "completions/mean_length": 4824.0078125,
+      "completions/mean_terminated_length": 4824.0078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 1.1070282831788063,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002424790756776929,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 10566415.0,
+      "reward": 0.28125,
+      "reward_std": 0.23698672652244568,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0011708867968991399,
+      "sampling/sampling_logp_difference/max": 6.749993801116943,
+      "sampling/sampling_logp_difference/mean": 0.02069389820098877,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 3.5075904634140898e-06,
+      "clip_ratio/high_mean": 8.768976158535224e-07,
+      "clip_ratio/low_mean": 2.2676964135825983e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3553861751679506e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12685.0,
+      "completions/mean_length": 5449.4140625,
+      "completions/mean_terminated_length": 5363.31494140625,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9817888736724854,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021046048495918512,
+      "learning_rate": 1e-05,
+      "loss": 0.0252,
+      "num_tokens": 11281908.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805688858032,
+      "sampling/importance_sampling_ratio/min": 0.013273254036903381,
+      "sampling/sampling_logp_difference/max": 4.322004318237305,
+      "sampling/sampling_logp_difference/mean": 0.019556276500225067,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 1.624216065465589e-05,
+      "clip_ratio/high_mean": 4.060540163663973e-06,
+      "clip_ratio/low_mean": 5.4349347919924185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.840988796990132e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14133.0,
+      "completions/max_terminated_length": 14133.0,
+      "completions/mean_length": 5343.25,
+      "completions/mean_terminated_length": 5343.25,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 1.04741720110178,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035894038155674934,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 11987692.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998996257781982,
+      "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05,
+      "sampling/sampling_logp_difference/max": 10.749964714050293,
+      "sampling/sampling_logp_difference/mean": 0.020530637353658676,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.272115029380075e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.272115029380075e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15138.0,
+      "completions/mean_length": 6301.9375,
+      "completions/mean_terminated_length": 5806.09814453125,
+      "completions/min_length": 72.0,
+      "completions/min_terminated_length": 72.0,
+      "entropy": 0.8892941772937775,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032246762420982122,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 12814244.0,
+      "reward": 0.3125,
+      "reward_std": 0.3606000542640686,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999184608459473,
+      "sampling/importance_sampling_ratio/min": 0.021351110190153122,
+      "sampling/sampling_logp_difference/max": 3.846651554107666,
+      "sampling/sampling_logp_difference/mean": 0.017541853711009026,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 9.956602298188955e-06,
+      "clip_ratio/high_mean": 2.4891505745472386e-06,
+      "clip_ratio/low_mean": 2.772165316855535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0210803743102588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16213.0,
+      "completions/max_terminated_length": 16213.0,
+      "completions/mean_length": 5297.46875,
+      "completions/mean_terminated_length": 5297.46875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8097029253840446,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023969109170138836,
+      "learning_rate": 1e-05,
+      "loss": -0.0153,
+      "num_tokens": 13512520.0,
+      "reward": 0.359375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999222159385681,
+      "sampling/importance_sampling_ratio/min": 0.005766105372458696,
+      "sampling/sampling_logp_difference/max": 5.155758380889893,
+      "sampling/sampling_logp_difference/mean": 0.017464376986026764,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 1.0098337497765897e-05,
+      "clip_ratio/high_mean": 2.524584374441474e-06,
+      "clip_ratio/low_mean": 3.173396362399217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.425854845318099e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14655.0,
+      "completions/mean_length": 4890.34375,
+      "completions/mean_terminated_length": 4799.84228515625,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.9267145916819572,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002759338356554508,
+      "learning_rate": 1e-05,
+      "loss": -0.0014,
+      "num_tokens": 14155556.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.008491010405123234,
+      "sampling/sampling_logp_difference/max": 4.768747329711914,
+      "sampling/sampling_logp_difference/mean": 0.018839433789253235,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 7.532389190600952e-06,
+      "clip_ratio/high_mean": 1.883097297650238e-06,
+      "clip_ratio/low_mean": 1.9051809317716106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0934906729053182e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16296.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 4609.40625,
+      "completions/mean_terminated_length": 4609.40625,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 1.171089917421341,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021055075339972973,
+      "learning_rate": 1e-05,
+      "loss": -0.0051,
+      "num_tokens": 14765328.0,
+      "reward": 0.2421875,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741911888123,
+      "sampling/importance_sampling_ratio/min": 5.368983693188056e-07,
+      "sampling/sampling_logp_difference/max": 14.437457084655762,
+      "sampling/sampling_logp_difference/mean": 0.020226795226335526,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.7169573766295798e-05,
+      "clip_ratio/high_mean": 4.2923934415739495e-06,
+      "clip_ratio/low_mean": 5.869748633813288e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.0162142189074075e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14299.0,
+      "completions/mean_length": 5099.0390625,
+      "completions/mean_terminated_length": 5010.18115234375,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.005959376692772,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027595218271017075,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 15438549.0,
+      "reward": 0.296875,
+      "reward_std": 0.20069602131843567,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887347221375,
+      "sampling/importance_sampling_ratio/min": 0.00013984869292471558,
+      "sampling/sampling_logp_difference/max": 8.87494945526123,
+      "sampling/sampling_logp_difference/mean": 0.01902824640274048,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 5.162942670722259e-06,
+      "clip_ratio/high_mean": 1.2907356676805648e-06,
+      "clip_ratio/low_mean": 3.6872071063953626e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.816280593582633e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7138.0390625,
+      "completions/mean_terminated_length": 6839.7822265625,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.0403362140059471,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002748022088780999,
+      "learning_rate": 1e-05,
+      "loss": 0.0647,
+      "num_tokens": 16373898.0,
+      "reward": 0.296875,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999048709869385,
+      "sampling/importance_sampling_ratio/min": 0.0003802926803473383,
+      "sampling/sampling_logp_difference/max": 7.874569416046143,
+      "sampling/sampling_logp_difference/mean": 0.020853528752923012,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.6506045439164154e-05,
+      "clip_ratio/low_min": 5.709326615033206e-06,
+      "clip_ratio/region_mean": 5.6506045439164154e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14543.0,
+      "completions/mean_length": 5420.515625,
+      "completions/mean_terminated_length": 5334.18896484375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 1.1339883506298065,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029502976685762405,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 17088156.0,
+      "reward": 0.1953125,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 9.70982582657598e-05,
+      "sampling/sampling_logp_difference/max": 9.239787101745605,
+      "sampling/sampling_logp_difference/mean": 0.0199423898011446,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 5.619998319161823e-06,
+      "clip_ratio/high_mean": 1.4049995797904558e-06,
+      "clip_ratio/low_mean": 6.439320418394345e-05,
+      "clip_ratio/low_min": 4.70632539872895e-06,
+      "clip_ratio/region_mean": 6.57982034226734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14636.0,
+      "completions/mean_length": 5116.3046875,
+      "completions/mean_terminated_length": 4845.88037109375,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.9503882825374603,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004891107324510813,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 17766619.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0010618992382660508,
+      "sampling/sampling_logp_difference/max": 6.847696304321289,
+      "sampling/sampling_logp_difference/mean": 0.01914183795452118,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.839018643247982e-05,
+      "clip_ratio/low_min": 4.115091087442124e-06,
+      "clip_ratio/region_mean": 3.839018643247982e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14634.0,
+      "completions/mean_length": 5061.8671875,
+      "completions/mean_terminated_length": 4972.71630859375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.0540335327386856,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030373274348676205,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 18432938.0,
+      "reward": 0.34375,
+      "reward_std": 0.28118088841438293,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999624490737915,
+      "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06,
+      "sampling/sampling_logp_difference/max": 13.272432327270508,
+      "sampling/sampling_logp_difference/mean": 0.019548218697309494,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.4656657867817557e-05,
+      "clip_ratio/high_mean": 4.665093399580655e-06,
+      "clip_ratio/low_mean": 3.751162262233265e-05,
+      "clip_ratio/low_min": 4.413062470121076e-06,
+      "clip_ratio/region_mean": 4.2176716192443564e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15782.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6349.9765625,
+      "completions/mean_terminated_length": 6349.9765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0268081277608871,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017623496241867542,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 19264743.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 6.870362267363816e-05,
+      "sampling/sampling_logp_difference/max": 9.585708618164062,
+      "sampling/sampling_logp_difference/mean": 0.019106190651655197,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 9.221375876222737e-06,
+      "clip_ratio/high_mean": 2.3053439690556843e-06,
+      "clip_ratio/low_mean": 3.09787185415189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.328406273794826e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 5815.484375,
+      "completions/mean_terminated_length": 5561.84033203125,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 1.0389493256807327,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003111837198957801,
+      "learning_rate": 1e-05,
+      "loss": -0.0162,
+      "num_tokens": 20030109.0,
+      "reward": 0.34375,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000298023223877,
+      "sampling/importance_sampling_ratio/min": 0.02987043187022209,
+      "sampling/sampling_logp_difference/max": 3.5108861923217773,
+      "sampling/sampling_logp_difference/mean": 0.020060991868376732,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 6.7810142354574054e-06,
+      "clip_ratio/high_mean": 1.6952535588643514e-06,
+      "clip_ratio/low_mean": 4.474762545214617e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644287901101052e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 5157.1484375,
+      "completions/mean_terminated_length": 5068.748046875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.0510126948356628,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003041633637621999,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 20710904.0,
+      "reward": 0.3125,
+      "reward_std": 0.35612428188323975,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 0.04357198625802994,
+      "sampling/sampling_logp_difference/max": 3.133340835571289,
+      "sampling/sampling_logp_difference/mean": 0.019007597118616104,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0962848566341563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0962848566341563e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15333.0,
+      "completions/max_terminated_length": 15333.0,
+      "completions/mean_length": 4446.3828125,
+      "completions/mean_terminated_length": 4446.3828125,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.053279548883438,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022369560319930315,
+      "learning_rate": 1e-05,
+      "loss": -0.001,
+      "num_tokens": 21298497.0,
+      "reward": 0.390625,
+      "reward_std": 0.24169495701789856,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998750686645508,
+      "sampling/importance_sampling_ratio/min": 0.006704842206090689,
+      "sampling/sampling_logp_difference/max": 5.00492525100708,
+      "sampling/sampling_logp_difference/mean": 0.01947362720966339,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8460265411922592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8460265411922592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15386.0,
+      "completions/mean_length": 6294.1484375,
+      "completions/mean_terminated_length": 6133.9921875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.2036212533712387,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021383841522037983,
+      "learning_rate": 1e-05,
+      "loss": 0.033,
+      "num_tokens": 22124812.0,
+      "reward": 0.171875,
+      "reward_std": 0.20752590894699097,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858736991882,
+      "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07,
+      "sampling/sampling_logp_difference/max": 14.742476463317871,
+      "sampling/sampling_logp_difference/mean": 0.022367021068930626,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 1.73864664247958e-05,
+      "clip_ratio/high_mean": 4.34661660619895e-06,
+      "clip_ratio/low_mean": 3.19569651310303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.630358173722925e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14893.0,
+      "completions/mean_length": 6011.4921875,
+      "completions/mean_terminated_length": 5929.81884765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.123318687081337,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00126531848218292,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 22915091.0,
+      "reward": 0.171875,
+      "reward_std": 0.2330477386713028,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05,
+      "sampling/sampling_logp_difference/max": 11.02016544342041,
+      "sampling/sampling_logp_difference/mean": 0.019905246794223785,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 2.8753217975463485e-05,
+      "clip_ratio/high_mean": 7.188304493865871e-06,
+      "clip_ratio/low_mean": 3.818478444372886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.537308905128157e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5152.46875,
+      "completions/mean_terminated_length": 5064.03125,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "entropy": 1.0477670058608055,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030069497879594564,
+      "learning_rate": 1e-05,
+      "loss": 0.1026,
+      "num_tokens": 23596487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29142576456069946,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999433755874634,
+      "sampling/importance_sampling_ratio/min": 9.009604013954231e-07,
+      "sampling/sampling_logp_difference/max": 13.919804573059082,
+      "sampling/sampling_logp_difference/mean": 0.019003981724381447,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 3.069575450354023e-05,
+      "clip_ratio/high_mean": 7.673938625885057e-06,
+      "clip_ratio/low_mean": 3.4847614415411954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.252155258654966e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12792.0,
+      "completions/max_terminated_length": 12792.0,
+      "completions/mean_length": 4672.5703125,
+      "completions/mean_terminated_length": 4672.5703125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9471446052193642,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002676331205293536,
+      "learning_rate": 1e-05,
+      "loss": 0.0724,
+      "num_tokens": 24213408.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2988021969795227,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000251531600952,
+      "sampling/importance_sampling_ratio/min": 0.0013351094676181674,
+      "sampling/sampling_logp_difference/max": 6.618741989135742,
+      "sampling/sampling_logp_difference/mean": 0.0179576613008976,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6127243245355203e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6127243245355203e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16108.0,
+      "completions/mean_length": 7013.734375,
+      "completions/mean_terminated_length": 6711.4677734375,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "entropy": 1.1254516392946243,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023615453392267227,
+      "learning_rate": 1e-05,
+      "loss": 0.0384,
+      "num_tokens": 25130262.0,
+      "reward": 0.1953125,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06,
+      "sampling/sampling_logp_difference/max": 11.925450325012207,
+      "sampling/sampling_logp_difference/mean": 0.0215257927775383,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 4.06954040954588e-06,
+      "clip_ratio/high_mean": 1.01738510238647e-06,
+      "clip_ratio/low_mean": 4.180071573500754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.281810015527299e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5858.59375,
+      "completions/mean_terminated_length": 5605.984375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "entropy": 1.0713739022612572,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029018481727689505,
+      "learning_rate": 1e-05,
+      "loss": 0.1041,
+      "num_tokens": 25898194.0,
+      "reward": 0.3671875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999915957450867,
+      "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05,
+      "sampling/sampling_logp_difference/max": 10.992064476013184,
+      "sampling/sampling_logp_difference/mean": 0.019959844648838043,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 1.2810827229259303e-05,
+      "clip_ratio/high_mean": 3.2027068073148257e-06,
+      "clip_ratio/low_mean": 3.29701083501277e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.617281504375569e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14004.0,
+      "completions/mean_length": 6952.6015625,
+      "completions/mean_terminated_length": 6726.24853515625,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "entropy": 1.028619796037674,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022342968732118607,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 26812791.0,
+      "reward": 0.234375,
+      "reward_std": 0.26827272772789,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 4.540153167909011e-05,
+      "sampling/sampling_logp_difference/max": 9.999964714050293,
+      "sampling/sampling_logp_difference/mean": 0.02002539485692978,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 1.5225089100567857e-05,
+      "clip_ratio/high_mean": 6.960676159906143e-06,
+      "clip_ratio/low_mean": 4.09088329433871e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7869508762232726e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16361.0,
+      "completions/mean_length": 6413.421875,
+      "completions/mean_terminated_length": 6174.12841796875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9452399462461472,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021800603717565536,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 27652757.0,
+      "reward": 0.296875,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439120292664,
+      "sampling/importance_sampling_ratio/min": 3.895394547726028e-05,
+      "sampling/sampling_logp_difference/max": 10.153130531311035,
+      "sampling/sampling_logp_difference/mean": 0.019722118973731995,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.9564903318023426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9564903318023426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15754.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 5176.3515625,
+      "completions/mean_terminated_length": 5176.3515625,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 1.0444758981466293,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004153470974415541,
+      "learning_rate": 1e-05,
+      "loss": 0.0798,
+      "num_tokens": 28334386.0,
+      "reward": 0.2734375,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.007421077694743872,
+      "sampling/sampling_logp_difference/max": 4.903430938720703,
+      "sampling/sampling_logp_difference/mean": 0.020159056410193443,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 1.725743459246587e-05,
+      "clip_ratio/high_mean": 4.3143586481164675e-06,
+      "clip_ratio/low_mean": 2.0204584302518924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.451894306432223e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 5178.9921875,
+      "completions/mean_terminated_length": 5001.13525390625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0803537145256996,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002477057045325637,
+      "learning_rate": 1e-05,
+      "loss": 0.0067,
+      "num_tokens": 29017145.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000497102737427,
+      "sampling/importance_sampling_ratio/min": 0.004630985204130411,
+      "sampling/sampling_logp_difference/max": 5.374985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019826076924800873,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 1.6637992303003557e-05,
+      "clip_ratio/high_mean": 4.159498075750889e-06,
+      "clip_ratio/low_mean": 2.1970684144889674e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6130182106953725e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14131.0,
+      "completions/max_terminated_length": 14131.0,
+      "completions/mean_length": 4980.359375,
+      "completions/mean_terminated_length": 4980.359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.9510642662644386,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016275218222290277,
+      "learning_rate": 1e-05,
+      "loss": -0.0097,
+      "num_tokens": 29673535.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750852584839,
+      "sampling/importance_sampling_ratio/min": 0.000599516904912889,
+      "sampling/sampling_logp_difference/max": 7.419386386871338,
+      "sampling/sampling_logp_difference/mean": 0.01844976656138897,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 2.8087193186365766e-05,
+      "clip_ratio/high_mean": 7.021798296591442e-06,
+      "clip_ratio/low_mean": 3.9683913541921356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.670571286169434e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 5778.6953125,
+      "completions/mean_terminated_length": 5695.18896484375,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 1.0413239300251007,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001847646082751453,
+      "learning_rate": 1e-05,
+      "loss": -0.0045,
+      "num_tokens": 30436416.0,
+      "reward": 0.2578125,
+      "reward_std": 0.33903977274894714,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998501539230347,
+      "sampling/importance_sampling_ratio/min": 0.00020348970429040492,
+      "sampling/sampling_logp_difference/max": 8.499895095825195,
+      "sampling/sampling_logp_difference/mean": 0.021502099931240082,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 2.68402091023745e-05,
+      "clip_ratio/high_mean": 8.575278570788214e-06,
+      "clip_ratio/low_mean": 4.547183698377921e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.404711600931478e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14182.0,
+      "completions/max_terminated_length": 14182.0,
+      "completions/mean_length": 4875.125,
+      "completions/mean_terminated_length": 4875.125,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "entropy": 1.0464690178632736,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021134833805263042,
+      "learning_rate": 1e-05,
+      "loss": 0.0727,
+      "num_tokens": 31083672.0,
+      "reward": 0.40625,
+      "reward_std": 0.3584783971309662,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340176582336,
+      "sampling/importance_sampling_ratio/min": 0.012113225646317005,
+      "sampling/sampling_logp_difference/max": 4.41345739364624,
+      "sampling/sampling_logp_difference/mean": 0.019140049815177917,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 3.9877967992651975e-05,
+      "clip_ratio/high_mean": 9.969491998162994e-06,
+      "clip_ratio/low_mean": 3.981287841270387e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9782369273998484e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 4691.421875,
+      "completions/mean_terminated_length": 4505.82568359375,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 1.0229775309562683,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037735572550445795,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 31703654.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2993389964103699,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492168426514,
+      "sampling/importance_sampling_ratio/min": 0.03150063753128052,
+      "sampling/sampling_logp_difference/max": 3.457747459411621,
+      "sampling/sampling_logp_difference/mean": 0.01912039890885353,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 3.5441889849607833e-06,
+      "clip_ratio/high_mean": 8.860472462401958e-07,
+      "clip_ratio/low_mean": 1.5137359810069029e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6023407056309225e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 6821.96875,
+      "completions/mean_terminated_length": 6592.48046875,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 1.1132484003901482,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0010448681423440576,
+      "learning_rate": 1e-05,
+      "loss": 0.022,
+      "num_tokens": 32599778.0,
+      "reward": 0.2265625,
+      "reward_std": 0.1814819872379303,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999915361404419,
+      "sampling/importance_sampling_ratio/min": 0.006500681862235069,
+      "sampling/sampling_logp_difference/max": 5.035848140716553,
+      "sampling/sampling_logp_difference/mean": 0.02125459350645542,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 4.652893949241843e-06,
+      "clip_ratio/high_mean": 1.1632234873104608e-06,
+      "clip_ratio/low_mean": 5.731516603191267e-05,
+      "clip_ratio/low_min": 9.891066838463303e-06,
+      "clip_ratio/region_mean": 5.8478389746596804e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 6834.3671875,
+      "completions/mean_terminated_length": 6605.17626953125,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9827468693256378,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0017670176457613707,
+      "learning_rate": 1e-05,
+      "loss": 0.1105,
+      "num_tokens": 33492737.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3440523147583008,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.0021202093921601772,
+      "sampling/sampling_logp_difference/max": 6.156240463256836,
+      "sampling/sampling_logp_difference/mean": 0.019490526989102364,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 6.717360520269722e-06,
+      "clip_ratio/high_mean": 2.503530367903295e-06,
+      "clip_ratio/low_mean": 2.5672919832686603e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8176450200589898e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14098.0,
+      "completions/mean_length": 6175.296875,
+      "completions/mean_terminated_length": 5845.98388671875,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 1.1584237962961197,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0016891945851966739,
+      "learning_rate": 1e-05,
+      "loss": -0.0008,
+      "num_tokens": 34312455.0,
+      "reward": 0.1875,
+      "reward_std": 0.19673937559127808,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 8.086384332273155e-05,
+      "sampling/sampling_logp_difference/max": 9.422743797302246,
+      "sampling/sampling_logp_difference/mean": 0.021749887615442276,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 2.2362002255249536e-05,
+      "clip_ratio/high_mean": 8.189798336388776e-06,
+      "clip_ratio/low_mean": 2.1058204993096297e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9248002192616696e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16054.0,
+      "completions/mean_length": 6036.8359375,
+      "completions/mean_terminated_length": 5955.3623046875,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.9301538467407227,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003834392176941037,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 35102738.0,
+      "reward": 0.4375,
+      "reward_std": 0.36614155769348145,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998494386672974,
+      "sampling/importance_sampling_ratio/min": 0.00013992394087836146,
+      "sampling/sampling_logp_difference/max": 8.874411582946777,
+      "sampling/sampling_logp_difference/mean": 0.019147861748933792,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1501961580506759e-05,
+      "clip_ratio/high_mean": 2.8754903951266897e-06,
+      "clip_ratio/low_mean": 4.08189714562468e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.369446196506033e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6262.46875,
+      "completions/mean_terminated_length": 5764.68798828125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "entropy": 0.8599015846848488,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029804729856550694,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 35924886.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3911295533180237,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999922513961792,
+      "sampling/importance_sampling_ratio/min": 0.00021375219512265176,
+      "sampling/sampling_logp_difference/max": 9.904524803161621,
+      "sampling/sampling_logp_difference/mean": 0.01815103553235531,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 2.4107544049911667e-05,
+      "clip_ratio/high_mean": 6.026886012477917e-06,
+      "clip_ratio/low_mean": 3.6588148361715866e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.261503391944643e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14556.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 5926.8984375,
+      "completions/mean_terminated_length": 5926.8984375,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "entropy": 1.0042993426322937,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022071697749197483,
+      "learning_rate": 1e-05,
+      "loss": 0.0059,
+      "num_tokens": 36700913.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3306073546409607,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000010371208191,
+      "sampling/importance_sampling_ratio/min": 0.0005220364546403289,
+      "sampling/sampling_logp_difference/max": 7.557773113250732,
+      "sampling/sampling_logp_difference/mean": 0.01954064890742302,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 4.9106265578302555e-06,
+      "clip_ratio/high_mean": 1.2276566394575639e-06,
+      "clip_ratio/low_mean": 2.634599570683349e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7573652346291055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15217.0,
+      "completions/mean_length": 6873.6875,
+      "completions/mean_terminated_length": 6645.4404296875,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 1.0255412608385086,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002320924773812294,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 37604865.0,
+      "reward": 0.234375,
+      "reward_std": 0.3135228157043457,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999098777770996,
+      "sampling/importance_sampling_ratio/min": 0.026153141632676125,
+      "sampling/sampling_logp_difference/max": 3.6437859535217285,
+      "sampling/sampling_logp_difference/mean": 0.019532475620508194,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.6350510122720152e-05,
+      "clip_ratio/high_mean": 4.087627530680038e-06,
+      "clip_ratio/low_mean": 2.351988746340794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7607515221461654e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15668.0,
+      "completions/mean_length": 6073.8984375,
+      "completions/mean_terminated_length": 5992.71630859375,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 1.0713753998279572,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002212709980085492,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 38405196.0,
+      "reward": 0.359375,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998978972434998,
+      "sampling/importance_sampling_ratio/min": 8.706459084351081e-06,
+      "sampling/sampling_logp_difference/max": 11.651445388793945,
+      "sampling/sampling_logp_difference/mean": 0.021252838894724846,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.729486718384578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.729486718384578e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15299.0,
+      "completions/mean_length": 5838.71875,
+      "completions/mean_terminated_length": 5671.33349609375,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "entropy": 1.021155133843422,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001135052996687591,
+      "learning_rate": 1e-05,
+      "loss": 0.0178,
+      "num_tokens": 39171704.0,
+      "reward": 0.28125,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.003084881929680705,
+      "sampling/sampling_logp_difference/max": 5.7812418937683105,
+      "sampling/sampling_logp_difference/mean": 0.020781882107257843,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 1.7124169744420215e-05,
+      "clip_ratio/high_mean": 4.281042436105054e-06,
+      "clip_ratio/low_mean": 3.706903294187214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.135007543482061e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14617.0,
+      "completions/max_terminated_length": 14617.0,
+      "completions/mean_length": 6358.5859375,
+      "completions/mean_terminated_length": 6358.5859375,
+      "completions/min_length": 940.0,
+      "completions/min_terminated_length": 940.0,
+      "entropy": 0.9720487147569656,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002638082252815366,
+      "learning_rate": 1e-05,
+      "loss": 0.0145,
+      "num_tokens": 40003859.0,
+      "reward": 0.40625,
+      "reward_std": 0.3174618184566498,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000380277633667,
+      "sampling/importance_sampling_ratio/min": 0.01960253342986107,
+      "sampling/sampling_logp_difference/max": 3.932096481323242,
+      "sampling/sampling_logp_difference/mean": 0.01991666667163372,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 6.55582925901399e-06,
+      "clip_ratio/high_mean": 2.994117721755174e-06,
+      "clip_ratio/low_mean": 2.222621503733535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5220332759090525e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14753.0,
+      "completions/max_terminated_length": 14753.0,
+      "completions/mean_length": 4634.1875,
+      "completions/mean_terminated_length": 4634.1875,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9715309366583824,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001994960242882371,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 40616483.0,
+      "reward": 0.4375,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000698566436768,
+      "sampling/importance_sampling_ratio/min": 1.0510009815334342e-05,
+      "sampling/sampling_logp_difference/max": 11.46318244934082,
+      "sampling/sampling_logp_difference/mean": 0.01902047172188759,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 2.2474248908110894e-05,
+      "clip_ratio/high_mean": 7.571314540655294e-06,
+      "clip_ratio/low_mean": 4.3583780325207044e-05,
+      "clip_ratio/low_min": 4.6013396968191955e-06,
+      "clip_ratio/region_mean": 5.1155094070054474e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 6596.25,
+      "completions/mean_terminated_length": 6361.34423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 0.8207943215966225,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019902780186384916,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 41484443.0,
+      "reward": 0.4453125,
+      "reward_std": 0.326668381690979,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000016689300537,
+      "sampling/importance_sampling_ratio/min": 7.485233072657138e-05,
+      "sampling/sampling_logp_difference/max": 9.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.018301833420991898,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 3.0019932637515012e-06,
+      "clip_ratio/high_mean": 7.504983159378753e-07,
+      "clip_ratio/low_mean": 4.332785601945943e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.407835376696312e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6785.75,
+      "completions/mean_terminated_length": 6313.70458984375,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "entropy": 0.9876058474183083,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015235114842653275,
+      "learning_rate": 1e-05,
+      "loss": 0.0128,
+      "num_tokens": 42372235.0,
+      "reward": 0.2421875,
+      "reward_std": 0.325075626373291,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999551773071289,
+      "sampling/importance_sampling_ratio/min": 0.026679370552301407,
+      "sampling/sampling_logp_difference/max": 3.6238646507263184,
+      "sampling/sampling_logp_difference/mean": 0.019945615902543068,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1349006601667497e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1349006601667497e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 4881.2109375,
+      "completions/mean_terminated_length": 4510.1533203125,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.989942155778408,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002033712575212121,
+      "learning_rate": 1e-05,
+      "loss": 0.1088,
+      "num_tokens": 43015238.0,
+      "reward": 0.4375,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000300407409668,
+      "sampling/importance_sampling_ratio/min": 0.0001238943514181301,
+      "sampling/sampling_logp_difference/max": 8.996081352233887,
+      "sampling/sampling_logp_difference/mean": 0.01887543685734272,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 2.584004687378183e-05,
+      "clip_ratio/high_mean": 6.4600117184454575e-06,
+      "clip_ratio/low_mean": 2.1371045761497953e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7831058105221018e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15001.0,
+      "completions/max_terminated_length": 15001.0,
+      "completions/mean_length": 4725.3984375,
+      "completions/mean_terminated_length": 4725.3984375,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 1.0350637435913086,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030296226032078266,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 43637737.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999939203262329,
+      "sampling/importance_sampling_ratio/min": 0.00022932067804504186,
+      "sampling/sampling_logp_difference/max": 8.380389213562012,
+      "sampling/sampling_logp_difference/mean": 0.01995944231748581,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 1.994733975152485e-05,
+      "clip_ratio/high_mean": 4.986834937881213e-06,
+      "clip_ratio/low_mean": 3.5168303838872816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.015513832200668e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16240.0,
+      "completions/mean_length": 4918.171875,
+      "completions/mean_terminated_length": 4736.1748046875,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.965274304151535,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002758471528068185,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 44285327.0,
+      "reward": 0.328125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999663233757019,
+      "sampling/importance_sampling_ratio/min": 0.010958661325275898,
+      "sampling/sampling_logp_difference/max": 4.513625144958496,
+      "sampling/sampling_logp_difference/mean": 0.019083233550190926,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 1.0621563887980301e-05,
+      "clip_ratio/high_mean": 2.6553909719950752e-06,
+      "clip_ratio/low_mean": 3.838553107016196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1040922042157035e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15031.0,
+      "completions/mean_length": 4998.2890625,
+      "completions/mean_terminated_length": 4908.6376953125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9200445115566254,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027611786499619484,
+      "learning_rate": 1e-05,
+      "loss": 0.0575,
+      "num_tokens": 44944356.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3895368278026581,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884366989136,
+      "sampling/importance_sampling_ratio/min": 0.0018651526188477874,
+      "sampling/sampling_logp_difference/max": 6.284412384033203,
+      "sampling/sampling_logp_difference/mean": 0.017853498458862305,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.0136624496226432e-05,
+      "clip_ratio/high_mean": 2.534156124056608e-06,
+      "clip_ratio/low_mean": 2.0260404085092887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2794560095462657e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6290.1796875,
+      "completions/mean_terminated_length": 6129.96044921875,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.9360214695334435,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015557854203507304,
+      "learning_rate": 1e-05,
+      "loss": 0.0111,
+      "num_tokens": 45767867.0,
+      "reward": 0.34375,
+      "reward_std": 0.30168038606643677,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999427795410156,
+      "sampling/importance_sampling_ratio/min": 0.0011004531988874078,
+      "sampling/sampling_logp_difference/max": 6.812033176422119,
+      "sampling/sampling_logp_difference/mean": 0.0200855303555727,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 2.2559511307918e-06,
+      "clip_ratio/high_mean": 5.6398778269795e-07,
+      "clip_ratio/low_mean": 4.51761221711422e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.574010984015331e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16366.0,
+      "completions/mean_length": 6486.15625,
+      "completions/mean_terminated_length": 6248.6083984375,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "entropy": 0.863138921558857,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026953541673719883,
+      "learning_rate": 1e-05,
+      "loss": -0.0194,
+      "num_tokens": 46618575.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999406337738037,
+      "sampling/importance_sampling_ratio/min": 0.0011708897072821856,
+      "sampling/sampling_logp_difference/max": 6.749991416931152,
+      "sampling/sampling_logp_difference/mean": 0.01863238587975502,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.0073357771034352e-05,
+      "clip_ratio/high_mean": 2.518339442758588e-06,
+      "clip_ratio/low_mean": 2.787370635815023e-05,
+      "clip_ratio/low_min": 3.837534222839167e-06,
+      "clip_ratio/region_mean": 3.0392045573535142e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 6442.7734375,
+      "completions/mean_terminated_length": 6284.9765625,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0242054909467697,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024442619178444147,
+      "learning_rate": 1e-05,
+      "loss": 0.0569,
+      "num_tokens": 47462274.0,
+      "reward": 0.328125,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998892545700073,
+      "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09,
+      "sampling/sampling_logp_difference/max": 19.124980926513672,
+      "sampling/sampling_logp_difference/mean": 0.019810764119029045,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 1.220810372615233e-05,
+      "clip_ratio/high_mean": 3.0520259315380827e-06,
+      "clip_ratio/low_mean": 4.339240456374682e-05,
+      "clip_ratio/low_min": 4.491233084991109e-06,
+      "clip_ratio/region_mean": 4.644443038159807e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 4807.765625,
+      "completions/mean_terminated_length": 4716.6142578125,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "entropy": 1.045751042664051,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002512057079002261,
+      "learning_rate": 1e-05,
+      "loss": 0.003,
+      "num_tokens": 48096692.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999058842658997,
+      "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05,
+      "sampling/sampling_logp_difference/max": 11.374892234802246,
+      "sampling/sampling_logp_difference/mean": 0.01960371434688568,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 5.37941218681226e-06,
+      "clip_ratio/high_mean": 1.344853046703065e-06,
+      "clip_ratio/low_mean": 3.0161771633174794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1506624850408116e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 6703.8359375,
+      "completions/mean_terminated_length": 6471.51220703125,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 1.0592866837978363,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016389708034694195,
+      "learning_rate": 1e-05,
+      "loss": -0.024,
+      "num_tokens": 48974399.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2585548758506775,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999353885650635,
+      "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06,
+      "sampling/sampling_logp_difference/max": 11.8125,
+      "sampling/sampling_logp_difference/mean": 0.020880095660686493,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 7.093600515872822e-06,
+      "clip_ratio/high_mean": 1.7734001289682055e-06,
+      "clip_ratio/low_mean": 4.470584758564655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.647924811251869e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16295.0,
+      "completions/mean_length": 6140.5078125,
+      "completions/mean_terminated_length": 5724.10546875,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 1.0998501181602478,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003946912474930286,
+      "learning_rate": 1e-05,
+      "loss": 0.0448,
+      "num_tokens": 49779920.0,
+      "reward": 0.34375,
+      "reward_std": 0.36796674132347107,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 2.849436668839189e-07,
+      "sampling/sampling_logp_difference/max": 15.070974349975586,
+      "sampling/sampling_logp_difference/mean": 0.021355850622057915,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.313956779038563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.313956779038563e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 6689.8046875,
+      "completions/mean_terminated_length": 6213.04052734375,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.8561654165387154,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021656695753335953,
+      "learning_rate": 1e-05,
+      "loss": 0.0283,
+      "num_tokens": 50655023.0,
+      "reward": 0.203125,
+      "reward_std": 0.21723884344100952,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999941885471344,
+      "sampling/importance_sampling_ratio/min": 2.836359499269747e-06,
+      "sampling/sampling_logp_difference/max": 12.772989273071289,
+      "sampling/sampling_logp_difference/mean": 0.01873670145869255,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 2.3421607693308033e-05,
+      "clip_ratio/high_mean": 7.242933975248889e-06,
+      "clip_ratio/low_mean": 3.896083626386826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.620377103492501e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14330.0,
+      "completions/max_terminated_length": 14330.0,
+      "completions/mean_length": 5707.0078125,
+      "completions/mean_terminated_length": 5707.0078125,
+      "completions/min_length": 625.0,
+      "completions/min_terminated_length": 625.0,
+      "entropy": 1.1396166533231735,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004121148493140936,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 51406536.0,
+      "reward": 0.3125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999328851699829,
+      "sampling/importance_sampling_ratio/min": 0.0005196487763896585,
+      "sampling/sampling_logp_difference/max": 7.562357425689697,
+      "sampling/sampling_logp_difference/mean": 0.020000409334897995,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 1.82290532393381e-05,
+      "clip_ratio/high_mean": 4.557263309834525e-06,
+      "clip_ratio/low_mean": 2.5275351731579576e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9832615496161452e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5655.6328125,
+      "completions/mean_terminated_length": 5571.1572265625,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "entropy": 0.8928132206201553,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032538517843931913,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 52148473.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29432642459869385,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000033378601074,
+      "sampling/importance_sampling_ratio/min": 0.0017573959194123745,
+      "sampling/sampling_logp_difference/max": 6.343922138214111,
+      "sampling/sampling_logp_difference/mean": 0.018881790339946747,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.2836022506235167e-05,
+      "clip_ratio/high_mean": 3.209005626558792e-06,
+      "clip_ratio/low_mean": 3.8109637216621195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.131864307055366e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7399.7890625,
+      "completions/mean_terminated_length": 7034.5771484375,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "entropy": 0.8808257132768631,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002061733277514577,
+      "learning_rate": 1e-05,
+      "loss": 0.0191,
+      "num_tokens": 53113230.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673962593079,
+      "sampling/importance_sampling_ratio/min": 0.005283349193632603,
+      "sampling/sampling_logp_difference/max": 5.243195056915283,
+      "sampling/sampling_logp_difference/mean": 0.018456293269991875,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 1.5806871488166507e-05,
+      "clip_ratio/high_mean": 4.739466817227367e-06,
+      "clip_ratio/low_mean": 3.610486896832299e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.084433521711617e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16208.0,
+      "completions/mean_length": 5730.9609375,
+      "completions/mean_terminated_length": 5475.2880859375,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9486126750707626,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012298432411625981,
+      "learning_rate": 1e-05,
+      "loss": 0.0208,
+      "num_tokens": 53864049.0,
+      "reward": 0.359375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999348521232605,
+      "sampling/importance_sampling_ratio/min": 4.832820559386164e-05,
+      "sampling/sampling_logp_difference/max": 9.937495231628418,
+      "sampling/sampling_logp_difference/mean": 0.01919996738433838,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.2390134997986024e-05,
+      "clip_ratio/high_mean": 3.097533749496506e-06,
+      "clip_ratio/low_mean": 3.8867822581778455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.19653564449618e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13500.0,
+      "completions/mean_length": 4620.5703125,
+      "completions/mean_terminated_length": 4527.94482421875,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9557560831308365,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002882040338590741,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 54473498.0,
+      "reward": 0.3984375,
+      "reward_std": 0.39294686913490295,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998915195465088,
+      "sampling/importance_sampling_ratio/min": 1.577107298089686e-07,
+      "sampling/sampling_logp_difference/max": 15.662503242492676,
+      "sampling/sampling_logp_difference/mean": 0.018525000661611557,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.088819471486204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.088819471486204e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16314.0,
+      "completions/max_terminated_length": 16314.0,
+      "completions/mean_length": 5074.0703125,
+      "completions/mean_terminated_length": 5074.0703125,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8830869868397713,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003324020653963089,
+      "learning_rate": 1e-05,
+      "loss": 0.0305,
+      "num_tokens": 55141787.0,
+      "reward": 0.4609375,
+      "reward_std": 0.30115634202957153,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999203681945801,
+      "sampling/importance_sampling_ratio/min": 0.0009876838885247707,
+      "sampling/sampling_logp_difference/max": 6.920147895812988,
+      "sampling/sampling_logp_difference/mean": 0.018072880804538727,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.526649884908693e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.526649884908693e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15251.0,
+      "completions/max_terminated_length": 15251.0,
+      "completions/mean_length": 6192.1015625,
+      "completions/mean_terminated_length": 6192.1015625,
+      "completions/min_length": 553.0,
+      "completions/min_terminated_length": 553.0,
+      "entropy": 1.0888547226786613,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017452294705435634,
+      "learning_rate": 1e-05,
+      "loss": 0.0216,
+      "num_tokens": 55954144.0,
+      "reward": 0.2890625,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473690986633,
+      "sampling/importance_sampling_ratio/min": 5.061922365712235e-07,
+      "sampling/sampling_logp_difference/max": 14.496349334716797,
+      "sampling/sampling_logp_difference/mean": 0.021221645176410675,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.6768677141953958e-05,
+      "clip_ratio/high_mean": 5.080836899651331e-06,
+      "clip_ratio/low_mean": 3.340929970363504e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.84901372854074e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6204.296875,
+      "completions/mean_terminated_length": 6124.1416015625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 1.0423575639724731,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0033357341308146715,
+      "learning_rate": 1e-05,
+      "loss": 0.1073,
+      "num_tokens": 56765470.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37875816226005554,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99998539686203,
+      "sampling/importance_sampling_ratio/min": 4.564182381727733e-05,
+      "sampling/sampling_logp_difference/max": 9.994686126708984,
+      "sampling/sampling_logp_difference/mean": 0.01908688060939312,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 3.149884150843718e-06,
+      "clip_ratio/high_mean": 7.874710377109295e-07,
+      "clip_ratio/low_mean": 2.430614893000893e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.509361991087644e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14409.0,
+      "completions/max_terminated_length": 14409.0,
+      "completions/mean_length": 5070.3125,
+      "completions/mean_terminated_length": 5070.3125,
+      "completions/min_length": 629.0,
+      "completions/min_terminated_length": 629.0,
+      "entropy": 1.0737399458885193,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038695367984473705,
+      "learning_rate": 1e-05,
+      "loss": 0.0015,
+      "num_tokens": 57432958.0,
+      "reward": 0.390625,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223947525024,
+      "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06,
+      "sampling/sampling_logp_difference/max": 13.376652717590332,
+      "sampling/sampling_logp_difference/mean": 0.01970684342086315,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 1.9821940441033803e-05,
+      "clip_ratio/high_mean": 4.955485110258451e-06,
+      "clip_ratio/low_mean": 2.9055729555693688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.401121466595214e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15799.0,
+      "completions/mean_length": 5750.21875,
+      "completions/mean_terminated_length": 5495.00830078125,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "entropy": 0.9708107560873032,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002927646040916443,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 58187426.0,
+      "reward": 0.296875,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999390840530396,
+      "sampling/importance_sampling_ratio/min": 0.015204614959657192,
+      "sampling/sampling_logp_difference/max": 4.186156272888184,
+      "sampling/sampling_logp_difference/mean": 0.019483914598822594,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.3815636723156786e-05,
+      "clip_ratio/high_mean": 5.953909180789196e-06,
+      "clip_ratio/low_mean": 4.989707144886779e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.585097960647545e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15938.0,
+      "completions/mean_length": 6067.484375,
+      "completions/mean_terminated_length": 5986.251953125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9576351121068001,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0026169484481215477,
+      "learning_rate": 1e-05,
+      "loss": -0.0055,
+      "num_tokens": 58983336.0,
+      "reward": 0.390625,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999620914459229,
+      "sampling/importance_sampling_ratio/min": 1.974713995878119e-06,
+      "sampling/sampling_logp_difference/max": 13.135087013244629,
+      "sampling/sampling_logp_difference/mean": 0.019007554277777672,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 2.4238934656750644e-05,
+      "clip_ratio/high_mean": 7.786730066072778e-06,
+      "clip_ratio/low_mean": 4.5700241571466904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3486972547034384e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13640.0,
+      "completions/max_terminated_length": 13640.0,
+      "completions/mean_length": 4612.8984375,
+      "completions/mean_terminated_length": 4612.8984375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "entropy": 0.9636320173740387,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015429699560627341,
+      "learning_rate": 1e-05,
+      "loss": -0.018,
+      "num_tokens": 59590763.0,
+      "reward": 0.421875,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08,
+      "sampling/sampling_logp_difference/max": 17.468652725219727,
+      "sampling/sampling_logp_difference/mean": 0.019313856959342957,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0911465842109465e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0911465842109465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 6101.3125,
+      "completions/mean_terminated_length": 5854.5283203125,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "entropy": 0.8831139355897903,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022505265660583973,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 60391283.0,
+      "reward": 0.3125,
+      "reward_std": 0.29302334785461426,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 0.0003816343960352242,
+      "sampling/sampling_logp_difference/max": 7.871047496795654,
+      "sampling/sampling_logp_difference/mean": 0.018377842381596565,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 1.547606643725885e-05,
+      "clip_ratio/high_mean": 3.869016609314713e-06,
+      "clip_ratio/low_mean": 2.478705800967873e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8656074391619768e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14862.0,
+      "completions/mean_length": 4705.9921875,
+      "completions/mean_terminated_length": 4614.03955078125,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "entropy": 0.9557913094758987,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002069958718493581,
+      "learning_rate": 1e-05,
+      "loss": -0.0015,
+      "num_tokens": 61021490.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2637920379638672,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030232429504,
+      "sampling/importance_sampling_ratio/min": 2.76673017651774e-05,
+      "sampling/sampling_logp_difference/max": 10.495259284973145,
+      "sampling/sampling_logp_difference/mean": 0.018629569560289383,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 2.0910484636260662e-05,
+      "clip_ratio/high_mean": 5.2276211590651656e-06,
+      "clip_ratio/low_mean": 1.952954164607945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4757162805144617e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13745.0,
+      "completions/max_terminated_length": 13745.0,
+      "completions/mean_length": 5116.78125,
+      "completions/mean_terminated_length": 5116.78125,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "entropy": 1.0198405236005783,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034461067989468575,
+      "learning_rate": 1e-05,
+      "loss": -0.0073,
+      "num_tokens": 61695382.0,
+      "reward": 0.265625,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999936819076538,
+      "sampling/importance_sampling_ratio/min": 0.012227212078869343,
+      "sampling/sampling_logp_difference/max": 4.4040913581848145,
+      "sampling/sampling_logp_difference/mean": 0.019400250166654587,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.5340228401328204e-05,
+      "clip_ratio/high_mean": 3.835057100332051e-06,
+      "clip_ratio/low_mean": 3.150914017169271e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.534419727202476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 5891.9140625,
+      "completions/mean_terminated_length": 5553.45947265625,
+      "completions/min_length": 79.0,
+      "completions/min_terminated_length": 79.0,
+      "entropy": 0.9568078517913818,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025854657869786024,
+      "learning_rate": 1e-05,
+      "loss": 0.1013,
+      "num_tokens": 62474883.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001013278961182,
+      "sampling/importance_sampling_ratio/min": 0.0015072470996528864,
+      "sampling/sampling_logp_difference/max": 6.497470378875732,
+      "sampling/sampling_logp_difference/mean": 0.019574139267206192,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 1.108303422370227e-05,
+      "clip_ratio/high_mean": 2.7707585559255676e-06,
+      "clip_ratio/low_mean": 2.2325777763398946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5096536319324514e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13671.0,
+      "completions/mean_length": 5300.3359375,
+      "completions/mean_terminated_length": 5213.06298828125,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "entropy": 0.9722280204296112,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025075653102248907,
+      "learning_rate": 1e-05,
+      "loss": 0.0312,
+      "num_tokens": 63172454.0,
+      "reward": 0.203125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 0.00020346972451079637,
+      "sampling/sampling_logp_difference/max": 8.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02002432942390442,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 1.3991947980684927e-05,
+      "clip_ratio/high_mean": 3.4979869951712317e-06,
+      "clip_ratio/low_mean": 4.893367201930232e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.243165958290774e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15617.0,
+      "completions/mean_length": 6364.21875,
+      "completions/mean_terminated_length": 6205.1748046875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "entropy": 1.0607495978474617,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017982006538659334,
+      "learning_rate": 1e-05,
+      "loss": -0.0117,
+      "num_tokens": 64007602.0,
+      "reward": 0.2890625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 3.823801307589747e-05,
+      "sampling/sampling_logp_difference/max": 10.171680450439453,
+      "sampling/sampling_logp_difference/mean": 0.020373597741127014,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6416430046083406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6416430046083406e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14709.0,
+      "completions/mean_length": 5746.3125,
+      "completions/mean_terminated_length": 5403.1611328125,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "entropy": 0.9913106113672256,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002207317156717181,
+      "learning_rate": 1e-05,
+      "loss": 0.063,
+      "num_tokens": 64762058.0,
+      "reward": 0.34375,
+      "reward_std": 0.3264310359954834,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999239444732666,
+      "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08,
+      "sampling/sampling_logp_difference/max": 16.744617462158203,
+      "sampling/sampling_logp_difference/mean": 0.020608089864253998,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 1.2681661701208213e-05,
+      "clip_ratio/high_mean": 3.1704154253020533e-06,
+      "clip_ratio/low_mean": 3.541917828897567e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.85895939416514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6088.5625,
+      "completions/mean_terminated_length": 5841.47216796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.9040444120764732,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012974507408216596,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 65561002.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998487234115601,
+      "sampling/importance_sampling_ratio/min": 6.021501121722395e-06,
+      "sampling/sampling_logp_difference/max": 12.020174026489258,
+      "sampling/sampling_logp_difference/mean": 0.01939838007092476,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 7.807132533343975e-06,
+      "clip_ratio/high_mean": 1.9517831333359936e-06,
+      "clip_ratio/low_mean": 1.8564539345788944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.05163223654381e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15021.0,
+      "completions/mean_length": 5765.5,
+      "completions/mean_terminated_length": 5510.65625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 0.9966336265206337,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0013380619930103421,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 66318482.0,
+      "reward": 0.375,
+      "reward_std": 0.13994136452674866,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999471306800842,
+      "sampling/importance_sampling_ratio/min": 7.288413598871557e-06,
+      "sampling/sampling_logp_difference/max": 11.829224586486816,
+      "sampling/sampling_logp_difference/mean": 0.018109245225787163,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 1.7906912489706883e-05,
+      "clip_ratio/high_mean": 4.476728122426721e-06,
+      "clip_ratio/low_mean": 2.5812531305291486e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0289259655091882e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16120.0,
+      "completions/mean_length": 5462.78125,
+      "completions/mean_terminated_length": 5200.67236328125,
+      "completions/min_length": 460.0,
+      "completions/min_terminated_length": 460.0,
+      "entropy": 0.9345141425728798,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023930128663778305,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 67038582.0,
+      "reward": 0.46875,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513030052185,
+      "sampling/importance_sampling_ratio/min": 0.008508839644491673,
+      "sampling/sampling_logp_difference/max": 4.7666497230529785,
+      "sampling/sampling_logp_difference/mean": 0.019220296293497086,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.551389118503721e-05,
+      "clip_ratio/high_mean": 3.878472796259302e-06,
+      "clip_ratio/low_mean": 3.239646628117043e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6274939645863924e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15034.0,
+      "completions/max_terminated_length": 15034.0,
+      "completions/mean_length": 5547.5078125,
+      "completions/mean_terminated_length": 5547.5078125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0511749312281609,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0013633714988827705,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 67774487.0,
+      "reward": 0.203125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05,
+      "sampling/sampling_logp_difference/max": 11.418023109436035,
+      "sampling/sampling_logp_difference/mean": 0.020328814163804054,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.5384989410449634e-05,
+      "clip_ratio/high_mean": 3.846247352612409e-06,
+      "clip_ratio/low_mean": 3.441604167164769e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.826228908110352e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5835.4140625,
+      "completions/mean_terminated_length": 5406.609375,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 1.0024723336100578,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0036165034398436546,
+      "learning_rate": 1e-05,
+      "loss": 0.0373,
+      "num_tokens": 68541660.0,
+      "reward": 0.34375,
+      "reward_std": 0.3584783673286438,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 9.518130354990717e-06,
+      "sampling/sampling_logp_difference/max": 11.562312126159668,
+      "sampling/sampling_logp_difference/mean": 0.020469525828957558,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 6.105602551542688e-06,
+      "clip_ratio/high_mean": 1.526400637885672e-06,
+      "clip_ratio/low_mean": 5.3129634352444555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.46560352177039e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15695.0,
+      "completions/mean_length": 6252.609375,
+      "completions/mean_terminated_length": 6172.83447265625,
+      "completions/min_length": 481.0,
+      "completions/min_terminated_length": 481.0,
+      "entropy": 1.0325519517064095,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022011541295796633,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 69365418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.32301604747772217,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998809099197388,
+      "sampling/importance_sampling_ratio/min": 0.0005531083443202078,
+      "sampling/sampling_logp_difference/max": 7.4999566078186035,
+      "sampling/sampling_logp_difference/mean": 0.02079072594642639,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 4.348128641140647e-06,
+      "clip_ratio/high_mean": 1.0870321602851618e-06,
+      "clip_ratio/low_mean": 3.0097819148977578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.118485085451539e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15316.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 5581.484375,
+      "completions/mean_terminated_length": 5581.484375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.9222500994801521,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002300912281498313,
+      "learning_rate": 1e-05,
+      "loss": -0.0007,
+      "num_tokens": 70099320.0,
+      "reward": 0.296875,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998577833175659,
+      "sampling/importance_sampling_ratio/min": 8.140386853483506e-08,
+      "sampling/sampling_logp_difference/max": 16.323843002319336,
+      "sampling/sampling_logp_difference/mean": 0.01952272653579712,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.5122252029395895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5122252029395895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15781.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5424.140625,
+      "completions/mean_terminated_length": 5424.140625,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "entropy": 1.0446564108133316,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016312639927491546,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 70811474.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000094175338745,
+      "sampling/importance_sampling_ratio/min": 0.0021919538266956806,
+      "sampling/sampling_logp_difference/max": 6.12296199798584,
+      "sampling/sampling_logp_difference/mean": 0.019741754978895187,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.0354576261306647e-05,
+      "clip_ratio/high_mean": 3.496124691082514e-06,
+      "clip_ratio/low_mean": 4.096481598026003e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.446094089871622e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 5884.9609375,
+      "completions/mean_terminated_length": 5884.9609375,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9605691060423851,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032865386456251144,
+      "learning_rate": 1e-05,
+      "loss": 0.0451,
+      "num_tokens": 71582701.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3514111638069153,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999833106994629,
+      "sampling/importance_sampling_ratio/min": 1.149311810877407e-05,
+      "sampling/sampling_logp_difference/max": 11.373762130737305,
+      "sampling/sampling_logp_difference/mean": 0.019438734278082848,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 1.026998006636859e-05,
+      "clip_ratio/high_mean": 2.5674950165921473e-06,
+      "clip_ratio/low_mean": 3.5440503552308655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8007998455213965e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15361.0,
+      "completions/max_terminated_length": 15361.0,
+      "completions/mean_length": 4835.09375,
+      "completions/mean_terminated_length": 4835.09375,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.9038172215223312,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004721678793430328,
+      "learning_rate": 1e-05,
+      "loss": 0.1143,
+      "num_tokens": 72220025.0,
+      "reward": 0.4765625,
+      "reward_std": 0.38481879234313965,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99994957447052,
+      "sampling/importance_sampling_ratio/min": 2.710051205667696e-07,
+      "sampling/sampling_logp_difference/max": 15.12112808227539,
+      "sampling/sampling_logp_difference/mean": 0.017888439819216728,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 2.93432283342554e-05,
+      "clip_ratio/high_mean": 9.56252398509605e-06,
+      "clip_ratio/low_mean": 4.7865792453194445e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.742831808674964e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14431.0,
+      "completions/mean_length": 5979.078125,
+      "completions/mean_terminated_length": 5897.1494140625,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "entropy": 1.0227951630949974,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0010532280430197716,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 73005515.0,
+      "reward": 0.2890625,
+      "reward_std": 0.30115631222724915,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999090433120728,
+      "sampling/importance_sampling_ratio/min": 0.00030157779110595584,
+      "sampling/sampling_logp_difference/max": 8.10648250579834,
+      "sampling/sampling_logp_difference/mean": 0.019633149728178978,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 4.203234766464448e-06,
+      "clip_ratio/high_mean": 1.050808691616112e-06,
+      "clip_ratio/low_mean": 2.5574990331733716e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6625799137036665e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15886.0,
+      "completions/max_terminated_length": 15886.0,
+      "completions/mean_length": 4292.1796875,
+      "completions/mean_terminated_length": 4292.1796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.8719984591007233,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038324075285345316,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 73572794.0,
+      "reward": 0.4375,
+      "reward_std": 0.2972046136856079,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999188780784607,
+      "sampling/importance_sampling_ratio/min": 0.015675775706768036,
+      "sampling/sampling_logp_difference/max": 4.155638694763184,
+      "sampling/sampling_logp_difference/mean": 0.018074234947562218,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 4.431366960488958e-06,
+      "clip_ratio/high_mean": 1.1078417401222396e-06,
+      "clip_ratio/low_mean": 4.433405501913512e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.54418968729442e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14674.0,
+      "completions/max_terminated_length": 14674.0,
+      "completions/mean_length": 5449.2890625,
+      "completions/mean_terminated_length": 5449.2890625,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9137986451387405,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004843447357416153,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 74289607.0,
+      "reward": 0.5,
+      "reward_std": 0.40609243512153625,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 8.851584993863071e-07,
+      "sampling/sampling_logp_difference/max": 13.937499046325684,
+      "sampling/sampling_logp_difference/mean": 0.018183842301368713,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 8.212076863856055e-06,
+      "clip_ratio/high_mean": 2.0530192159640137e-06,
+      "clip_ratio/low_mean": 3.6279372466196946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.833239122741361e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16163.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 4983.3515625,
+      "completions/mean_terminated_length": 4983.3515625,
+      "completions/min_length": 541.0,
+      "completions/min_terminated_length": 541.0,
+      "entropy": 0.9354705810546875,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037651765160262585,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 74946484.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519309043884,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549984931946,
+      "sampling/importance_sampling_ratio/min": 0.00011593531962716952,
+      "sampling/sampling_logp_difference/max": 9.062478065490723,
+      "sampling/sampling_logp_difference/mean": 0.018207306042313576,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.3182888324081432e-05,
+      "clip_ratio/high_mean": 3.295722081020358e-06,
+      "clip_ratio/low_mean": 2.544108633628639e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8736808644680423e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16039.0,
+      "completions/mean_length": 6351.1015625,
+      "completions/mean_terminated_length": 6027.45947265625,
+      "completions/min_length": 804.0,
+      "completions/min_terminated_length": 804.0,
+      "entropy": 0.9310042560100555,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0009160125628113747,
+      "learning_rate": 1e-05,
+      "loss": -0.023,
+      "num_tokens": 75779145.0,
+      "reward": 0.3828125,
+      "reward_std": 0.24329257011413574,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998877048492432,
+      "sampling/importance_sampling_ratio/min": 0.0002961359277833253,
+      "sampling/sampling_logp_difference/max": 8.1246919631958,
+      "sampling/sampling_logp_difference/mean": 0.018513178452849388,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.1402620202716207e-05,
+      "clip_ratio/high_mean": 3.935649147024378e-06,
+      "clip_ratio/low_mean": 3.059757568735222e-05,
+      "clip_ratio/low_min": 4.3258582991256844e-06,
+      "clip_ratio/region_mean": 3.45332257438713e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14471.0,
+      "completions/mean_length": 5293.40625,
+      "completions/mean_terminated_length": 4935.64501953125,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "entropy": 1.0732879787683487,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023993055801838636,
+      "learning_rate": 1e-05,
+      "loss": 0.1021,
+      "num_tokens": 76475557.0,
+      "reward": 0.34375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000077724456787,
+      "sampling/importance_sampling_ratio/min": 6.613240111619234e-05,
+      "sampling/sampling_logp_difference/max": 9.623851776123047,
+      "sampling/sampling_logp_difference/mean": 0.020792219787836075,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 2.130644793396641e-05,
+      "clip_ratio/high_mean": 8.929533635182452e-06,
+      "clip_ratio/low_mean": 2.663600798769039e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.556554071337814e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 7619.7578125,
+      "completions/mean_terminated_length": 7409.41650390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9646238535642624,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014872358879074454,
+      "learning_rate": 1e-05,
+      "loss": 0.0439,
+      "num_tokens": 77474310.0,
+      "reward": 0.34375,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999638795852661,
+      "sampling/importance_sampling_ratio/min": 0.0016686831368133426,
+      "sampling/sampling_logp_difference/max": 6.395720481872559,
+      "sampling/sampling_logp_difference/mean": 0.020074717700481415,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 1.7765815300663235e-05,
+      "clip_ratio/high_mean": 5.154013138053415e-06,
+      "clip_ratio/low_mean": 5.166909659237717e-05,
+      "clip_ratio/low_min": 8.365680514543783e-06,
+      "clip_ratio/region_mean": 5.68231100714911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15984.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 5959.921875,
+      "completions/mean_terminated_length": 5959.921875,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.004471093416214,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00398358516395092,
+      "learning_rate": 1e-05,
+      "loss": 0.1016,
+      "num_tokens": 78257132.0,
+      "reward": 0.359375,
+      "reward_std": 0.3653082847595215,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000170469284058,
+      "sampling/importance_sampling_ratio/min": 0.0030075267422944307,
+      "sampling/sampling_logp_difference/max": 5.806637287139893,
+      "sampling/sampling_logp_difference/mean": 0.020755283534526825,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6946955838648137e-05,
+      "clip_ratio/high_mean": 4.236738959662034e-06,
+      "clip_ratio/low_mean": 4.510891039899434e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.934564867653535e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13736.0,
+      "completions/mean_length": 5427.03125,
+      "completions/mean_terminated_length": 5340.755859375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9117375314235687,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0019883522763848305,
+      "learning_rate": 1e-05,
+      "loss": 0.01,
+      "num_tokens": 78971072.0,
+      "reward": 0.375,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000550746917725,
+      "sampling/importance_sampling_ratio/min": 0.0008046010043472052,
+      "sampling/sampling_logp_difference/max": 7.125164031982422,
+      "sampling/sampling_logp_difference/mean": 0.018812140449881554,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 2.968176841022796e-05,
+      "clip_ratio/high_mean": 7.42044210255699e-06,
+      "clip_ratio/low_mean": 3.220799408154562e-05,
+      "clip_ratio/low_min": 5.315981979947537e-06,
+      "clip_ratio/region_mean": 3.962843629778945e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16293.0,
+      "completions/max_terminated_length": 16293.0,
+      "completions/mean_length": 6062.078125,
+      "completions/mean_terminated_length": 6062.078125,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 1.0164100378751755,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00450351694598794,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 79764434.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26355957984924316,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999713897705078,
+      "sampling/importance_sampling_ratio/min": 0.0007411236292682588,
+      "sampling/sampling_logp_difference/max": 7.207343101501465,
+      "sampling/sampling_logp_difference/mean": 0.020526543259620667,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.856050622947805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.856050622947805e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13689.0,
+      "completions/max_terminated_length": 13689.0,
+      "completions/mean_length": 4856.53125,
+      "completions/mean_terminated_length": 4856.53125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "entropy": 1.0780886858701706,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0033157530706375837,
+      "learning_rate": 1e-05,
+      "loss": 0.046,
+      "num_tokens": 80405238.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3487703502178192,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999889135360718,
+      "sampling/importance_sampling_ratio/min": 0.033773623406887054,
+      "sampling/sampling_logp_difference/max": 3.7256407737731934,
+      "sampling/sampling_logp_difference/mean": 0.019188418984413147,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.975351790406421e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.975351790406421e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16335.0,
+      "completions/max_terminated_length": 16335.0,
+      "completions/mean_length": 3930.5859375,
+      "completions/mean_terminated_length": 3930.5859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8666863515973091,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005471619311720133,
+      "learning_rate": 1e-05,
+      "loss": -0.0779,
+      "num_tokens": 80926721.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3164186179637909,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000040531158447,
+      "sampling/importance_sampling_ratio/min": 0.0002562212466727942,
+      "sampling/sampling_logp_difference/max": 8.269469261169434,
+      "sampling/sampling_logp_difference/mean": 0.017708823084831238,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 6.743997801095247e-06,
+      "clip_ratio/high_mean": 1.6859994502738118e-06,
+      "clip_ratio/low_mean": 3.61007656692891e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7786765119562915e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15546.0,
+      "completions/mean_length": 5934.9453125,
+      "completions/mean_terminated_length": 5684.16845703125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 0.9991667941212654,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002580739092081785,
+      "learning_rate": 1e-05,
+      "loss": -0.0065,
+      "num_tokens": 81707978.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24671243131160736,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000852346420288,
+      "sampling/importance_sampling_ratio/min": 0.002478762762621045,
+      "sampling/sampling_logp_difference/max": 5.999995708465576,
+      "sampling/sampling_logp_difference/mean": 0.019801246002316475,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.43532002741631e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.43532002741631e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 5866.84375,
+      "completions/mean_terminated_length": 5699.9052734375,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "entropy": 0.9848997294902802,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0010949905263260007,
+      "learning_rate": 1e-05,
+      "loss": 0.0266,
+      "num_tokens": 82477310.0,
+      "reward": 0.2734375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999667406082153,
+      "sampling/importance_sampling_ratio/min": 9.04304688447155e-05,
+      "sampling/sampling_logp_difference/max": 9.310929298400879,
+      "sampling/sampling_logp_difference/mean": 0.020769795402884483,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 1.9307613456476247e-05,
+      "clip_ratio/high_mean": 4.826903364119062e-06,
+      "clip_ratio/low_mean": 5.842190330440644e-05,
+      "clip_ratio/low_min": 1.2287753634154797e-05,
+      "clip_ratio/region_mean": 6.324880496322294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14501.0,
+      "completions/max_terminated_length": 14501.0,
+      "completions/mean_length": 6613.7578125,
+      "completions/mean_terminated_length": 6613.7578125,
+      "completions/min_length": 1033.0,
+      "completions/min_terminated_length": 1033.0,
+      "entropy": 0.9176012054085732,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020384234376251698,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 83345055.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999457001686096,
+      "sampling/importance_sampling_ratio/min": 0.029541675001382828,
+      "sampling/sampling_logp_difference/max": 3.5219533443450928,
+      "sampling/sampling_logp_difference/mean": 0.018883168697357178,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.382043183184578e-05,
+      "clip_ratio/high_mean": 3.455107957961445e-06,
+      "clip_ratio/low_mean": 5.789885449303256e-05,
+      "clip_ratio/low_min": 1.017130716718384e-05,
+      "clip_ratio/region_mean": 6.135396188255982e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 6392.3125,
+      "completions/mean_terminated_length": 6070.0,
+      "completions/min_length": 507.0,
+      "completions/min_terminated_length": 507.0,
+      "entropy": 0.904954232275486,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0031166900880634785,
+      "learning_rate": 1e-05,
+      "loss": 0.0351,
+      "num_tokens": 84186343.0,
+      "reward": 0.390625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999208450317383,
+      "sampling/importance_sampling_ratio/min": 0.00022529886336997151,
+      "sampling/sampling_logp_difference/max": 8.398082733154297,
+      "sampling/sampling_logp_difference/mean": 0.01931958645582199,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.7221671441802755e-05,
+      "clip_ratio/high_mean": 6.549099907715572e-06,
+      "clip_ratio/low_mean": 3.147818074467068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802728065238625e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5982.703125,
+      "completions/mean_terminated_length": 5817.603515625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 0.8394555225968361,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022041688207536936,
+      "learning_rate": 1e-05,
+      "loss": 0.1043,
+      "num_tokens": 84971129.0,
+      "reward": 0.3125,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030828475952,
+      "sampling/importance_sampling_ratio/min": 1.553593506287143e-06,
+      "sampling/sampling_logp_difference/max": 13.374939918518066,
+      "sampling/sampling_logp_difference/mean": 0.01795877143740654,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 2.9651660042873118e-05,
+      "clip_ratio/high_mean": 9.398806923854863e-06,
+      "clip_ratio/low_mean": 4.788733849636628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.728614519284747e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14988.0,
+      "completions/mean_length": 4976.921875,
+      "completions/mean_terminated_length": 4608.95166015625,
+      "completions/min_length": 335.0,
+      "completions/min_terminated_length": 335.0,
+      "entropy": 0.8381234556436539,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0037972736172378063,
+      "learning_rate": 1e-05,
+      "loss": 0.1244,
+      "num_tokens": 85625559.0,
+      "reward": 0.4765625,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970555305481,
+      "sampling/importance_sampling_ratio/min": 0.002990707289427519,
+      "sampling/sampling_logp_difference/max": 5.8122453689575195,
+      "sampling/sampling_logp_difference/mean": 0.01815030723810196,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 4.130592969886493e-06,
+      "clip_ratio/high_mean": 1.0326482424716232e-06,
+      "clip_ratio/low_mean": 1.6904315600640984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7936963843112608e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 6307.2421875,
+      "completions/mean_terminated_length": 6065.400390625,
+      "completions/min_length": 823.0,
+      "completions/min_terminated_length": 823.0,
+      "entropy": 1.1176434755325317,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0012413962977007031,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 86453606.0,
+      "reward": 0.28125,
+      "reward_std": 0.2280253767967224,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 0.004730688873678446,
+      "sampling/sampling_logp_difference/max": 5.353684425354004,
+      "sampling/sampling_logp_difference/mean": 0.021790307015180588,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.3160772823539446e-05,
+      "clip_ratio/high_mean": 3.2901932058848615e-06,
+      "clip_ratio/low_mean": 3.582628983167524e-05,
+      "clip_ratio/low_min": 2.61966624748311e-06,
+      "clip_ratio/region_mean": 3.911648195753514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 7263.1640625,
+      "completions/mean_terminated_length": 7044.26416015625,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.107876107096672,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017762042116373777,
+      "learning_rate": 1e-05,
+      "loss": 0.0349,
+      "num_tokens": 87402763.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741315841675,
+      "sampling/importance_sampling_ratio/min": 0.0009408573969267309,
+      "sampling/sampling_logp_difference/max": 6.968719005584717,
+      "sampling/sampling_logp_difference/mean": 0.02103034406900406,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.987745776612428e-05,
+      "clip_ratio/high_mean": 1.1877163728968299e-05,
+      "clip_ratio/low_mean": 4.26799579145154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.455712096136267e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15416.0,
+      "completions/mean_length": 5093.859375,
+      "completions/mean_terminated_length": 4914.65087890625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 1.1065888702869415,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032127038575708866,
+      "learning_rate": 1e-05,
+      "loss": 0.0194,
+      "num_tokens": 88077385.0,
+      "reward": 0.421875,
+      "reward_std": 0.345874547958374,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 7.033879228401929e-05,
+      "sampling/sampling_logp_difference/max": 9.562187194824219,
+      "sampling/sampling_logp_difference/mean": 0.020314980298280716,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 9.35208754526684e-06,
+      "clip_ratio/high_mean": 4.4788730519940145e-06,
+      "clip_ratio/low_mean": 3.470697703278347e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.918584917528278e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6943.53125,
+      "completions/mean_terminated_length": 6639.0,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "entropy": 0.9009081721305847,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028925195802003145,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "num_tokens": 88985269.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3535328209400177,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980628490448,
+      "sampling/importance_sampling_ratio/min": 6.553035092338177e-08,
+      "sampling/sampling_logp_difference/max": 16.540752410888672,
+      "sampling/sampling_logp_difference/mean": 0.019378282129764557,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 1.0939961612166371e-05,
+      "clip_ratio/high_mean": 2.734990403041593e-06,
+      "clip_ratio/low_mean": 2.4615862798782473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7350853201824066e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15148.0,
+      "completions/max_terminated_length": 15148.0,
+      "completions/mean_length": 4976.25,
+      "completions/mean_terminated_length": 4976.25,
+      "completions/min_length": 702.0,
+      "completions/min_terminated_length": 702.0,
+      "entropy": 0.9463540017604828,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017386430408805609,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 89645205.0,
+      "reward": 0.359375,
+      "reward_std": 0.26462042331695557,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999554753303528,
+      "sampling/importance_sampling_ratio/min": 7.889595508459024e-06,
+      "sampling/sampling_logp_difference/max": 11.74996566772461,
+      "sampling/sampling_logp_difference/mean": 0.018035830929875374,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 5.941629297012696e-06,
+      "clip_ratio/high_mean": 1.485407324253174e-06,
+      "clip_ratio/low_mean": 2.6826061798601586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8311469009167922e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 6439.5390625,
+      "completions/mean_terminated_length": 6281.69091796875,
+      "completions/min_length": 959.0,
+      "completions/min_terminated_length": 959.0,
+      "entropy": 0.899876207113266,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037381781730800867,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 90489394.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2624938488006592,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999206066131592,
+      "sampling/importance_sampling_ratio/min": 0.003606764366850257,
+      "sampling/sampling_logp_difference/max": 5.62494421005249,
+      "sampling/sampling_logp_difference/mean": 0.019368179142475128,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 5.189952389628161e-06,
+      "clip_ratio/high_mean": 1.2974880974070402e-06,
+      "clip_ratio/low_mean": 3.058137212974543e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.187886022715247e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15979.0,
+      "completions/mean_length": 6876.46875,
+      "completions/mean_terminated_length": 6408.884765625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.1018569767475128,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018562980694696307,
+      "learning_rate": 1e-05,
+      "loss": 0.095,
+      "num_tokens": 91390054.0,
+      "reward": 0.21875,
+      "reward_std": 0.29955869913101196,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999849796295166,
+      "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05,
+      "sampling/sampling_logp_difference/max": 10.436432838439941,
+      "sampling/sampling_logp_difference/mean": 0.020825792104005814,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.022083435804234e-05,
+      "clip_ratio/high_mean": 5.055208589510585e-06,
+      "clip_ratio/low_mean": 3.029032552603894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.53455343429232e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14153.0,
+      "completions/mean_length": 6501.5078125,
+      "completions/mean_terminated_length": 6344.64306640625,
+      "completions/min_length": 720.0,
+      "completions/min_terminated_length": 720.0,
+      "entropy": 1.073579266667366,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016695430967956781,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 92241535.0,
+      "reward": 0.2734375,
+      "reward_std": 0.28641316294670105,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998984336853027,
+      "sampling/importance_sampling_ratio/min": 0.0002380236255703494,
+      "sampling/sampling_logp_difference/max": 8.343140602111816,
+      "sampling/sampling_logp_difference/mean": 0.020438479259610176,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 3.3911180707946187e-06,
+      "clip_ratio/high_mean": 8.477795176986547e-07,
+      "clip_ratio/low_mean": 2.2190370486896427e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.30381500614385e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14345.0,
+      "completions/max_terminated_length": 14345.0,
+      "completions/mean_length": 5474.1328125,
+      "completions/mean_terminated_length": 5474.1328125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0692576617002487,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034909825772047043,
+      "learning_rate": 1e-05,
+      "loss": 0.0,
+      "num_tokens": 92962472.0,
+      "reward": 0.3046875,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000006079673767,
+      "sampling/importance_sampling_ratio/min": 0.0017851731972768903,
+      "sampling/sampling_logp_difference/max": 6.328239917755127,
+      "sampling/sampling_logp_difference/mean": 0.019930578768253326,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 2.6292200345778838e-05,
+      "clip_ratio/high_mean": 7.620442374900449e-06,
+      "clip_ratio/low_mean": 4.615546390596137e-05,
+      "clip_ratio/low_min": 1.366510537081922e-05,
+      "clip_ratio/region_mean": 5.3775906508235494e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7512.078125,
+      "completions/mean_terminated_length": 7225.88671875,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9676955863833427,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023449272848665714,
+      "learning_rate": 1e-05,
+      "loss": 0.0454,
+      "num_tokens": 93950506.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999359250068665,
+      "sampling/importance_sampling_ratio/min": 0.0016406332142651081,
+      "sampling/sampling_logp_difference/max": 6.412672996520996,
+      "sampling/sampling_logp_difference/mean": 0.020141655579209328,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 5.097255780128762e-06,
+      "clip_ratio/high_mean": 1.2743139450321905e-06,
+      "clip_ratio/low_mean": 3.3802551342887455e-05,
+      "clip_ratio/low_min": 4.146762421441963e-06,
+      "clip_ratio/region_mean": 3.5076865287919645e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6920.484375,
+      "completions/mean_terminated_length": 6693.3603515625,
+      "completions/min_length": 962.0,
+      "completions/min_terminated_length": 962.0,
+      "entropy": 0.8662540689110756,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037103090435266495,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 94854016.0,
+      "reward": 0.4375,
+      "reward_std": 0.322716623544693,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00047686786274425685,
+      "sampling/sampling_logp_difference/max": 7.648271083831787,
+      "sampling/sampling_logp_difference/mean": 0.01915796287357807,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 8.4922439782531e-06,
+      "clip_ratio/high_mean": 2.123060994563275e-06,
+      "clip_ratio/low_mean": 5.024227584726759e-05,
+      "clip_ratio/low_min": 1.3627016414829995e-05,
+      "clip_ratio/region_mean": 5.236533706920454e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 7939.609375,
+      "completions/mean_terminated_length": 7805.57177734375,
+      "completions/min_length": 1260.0,
+      "completions/min_terminated_length": 1260.0,
+      "entropy": 0.9707008600234985,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024642283096909523,
+      "learning_rate": 1e-05,
+      "loss": 0.0788,
+      "num_tokens": 95889966.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998771548271179,
+      "sampling/importance_sampling_ratio/min": 4.540014560916461e-05,
+      "sampling/sampling_logp_difference/max": 9.999995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020453302189707756,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.766829564710861e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.766829564710861e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14969.0,
+      "completions/mean_length": 5985.8203125,
+      "completions/mean_terminated_length": 5474.43408203125,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 0.9083090648055077,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003317479742690921,
+      "learning_rate": 1e-05,
+      "loss": 0.0537,
+      "num_tokens": 96676847.0,
+      "reward": 0.3671875,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.000286750087980181,
+      "sampling/sampling_logp_difference/max": 8.156899452209473,
+      "sampling/sampling_logp_difference/mean": 0.01996719278395176,
+      "step": 128
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 96676847,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-128/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-128/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """
+    Metadata collected from one model states checkpoint file by ``parse_model_states``.
+
+    The annotations only tell ``@dataclass`` which names are fields; nothing enforces them at runtime.
+    """
+    # buffer name -> tensor, each converted with ``.float()`` by the parser
+    buffers: dict
+    # value stored under ``PARAM_SHAPES``; the parser iterates it as a sequence of dicts
+    # NOTE(review): the ``dict`` annotation may be narrower than the real value -- confirm
+    param_shapes: dict
+    # ``[key, value]`` pairs built from the checkpoint's "shared_params" mapping
+    shared_params: list
+    # value stored under ``DS_VERSION``, or None when the checkpoint has no such entry
+    # NOTE(review): confirm the stored type really is ``int``
+    ds_version: int
+    # value stored under ``FROZEN_PARAM_SHAPES``, or None when absent
+    frozen_param_shapes: dict
+    # value stored under ``FROZEN_PARAM_FRAGMENTS``, or None when absent
+    frozen_param_fragments: dict
+
+
+# diagnostic switch: a truthy value makes parse_model_states print what it finds in each file
+debug = 0
+
+# load to cpu: passed as ``map_location`` to ``torch.load`` when checkpoint files are read
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Return ``int(text)`` when ``text.isdigit()`` is true, otherwise return ``text`` unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+
+
+def natural_keys(text):
+    '''
+    Sort key for human ("natural") ordering, e.g. alist.sort(key=natural_keys).
+
+    The text is split around runs of digits (the runs themselves are kept) and
+    every piece is passed through atoi.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the model states file expected for the given ZeRO stage.
+
+    Args:
+        - ``checkpoint_dir``: directory that should contain the model states file
+        - ``zero_stage``: ZeRO stage of the checkpoint; stages ``<= 2`` and stage ``3`` use different file names
+
+    Raises:
+        - ``FileNotFoundError`` if ``checkpoint_dir`` is not a directory or the expected file does not exist
+        - ``ValueError`` if ``zero_stage`` is neither ``<= 2`` nor ``3``
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch ``file`` would be unbound below and surface as an UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, sorted with ``natural_keys``.
+
+    Raises:
+        - ``FileNotFoundError`` if nothing matches the pattern
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    matches.sort(key=natural_keys)
+
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return matches
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    optim_pattern = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    model_pattern = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_pattern)
+
+
+def parse_model_states(files):
+    """
+    Load each model states file and collect the metadata kept in a ``zero_model_state``.
+
+    Args:
+        - ``files``: paths of model states checkpoint files; each is read with ``torch.load`` onto ``device``
+
+    Returns:
+        - list with one ``zero_model_state`` per entry of ``files``, in the same order
+
+    Raises:
+        - ``ValueError`` if a loaded file has no ``BUFFER_NAMES`` entry
+    """
+    parsed_states = []
+    for file in files:
+        ckpt = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in ckpt:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = ckpt[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # keep only the module entries listed as buffers, converting each with .float()
+        # (restores fp32 if they were saved in fp16)
+        buffers = {}
+        for key, value in ckpt["module"].items():
+            if key in buffer_names:
+                buffers[key] = value.float()
+        param_shapes = ckpt[PARAM_SHAPES]
+
+        # names of every parameter listed in param_shapes
+        # NOTE(review): param_names is not read again within this function
+        param_names = [name for group in param_shapes for name in group.keys()]
+
+        # the frozen-parameter entry is optional in the checkpoint
+        frozen_param_shapes = ckpt.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names.extend(frozen_param_shapes.keys())
+
+        # shared params are kept as [key, value] pairs
+        shared_params = [list(item) for item in ckpt["shared_params"].items()]
+
+        parsed_states.append(
+            zero_model_state(buffers=buffers,
+                             param_shapes=param_shapes,
+                             shared_params=shared_params,
+                             ds_version=ckpt.get(DS_VERSION, None),
+                             frozen_param_shapes=frozen_param_shapes,
+                             frozen_param_fragments=ckpt.get(FROZEN_PARAM_FRAGMENTS, None)))
+
+    return parsed_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load every optimizer states shard and pull out its fp32 groups.
+
+    Args:
+        - ``files``: paths of the optimizer states shards, read with ``torch.load`` onto ``device``
+        - ``ds_checkpoint_dir``: checkpoint folder; only used in the shard-count error message
+
+    Returns:
+        - tuple ``(zero_stage, world_size, fp32_flat_groups)``; ``fp32_flat_groups`` has one entry per shard
+
+    Raises:
+        - ``ValueError`` if the first shard has no ``ZERO_STAGE`` entry, if the number of shards differs
+          from the partition count, or if the stage is neither ``<= 2`` nor ``3``
+    """
+    total_files = len(files)
+    state_dicts = []
+    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
+        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
+        # only the fp32 master weights are needed, so drop the nested (potentially huge) optimizer
+        # states right away; pop with a default tolerates them having been removed already
+        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(shard)
+
+    first_optim = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim[ZERO_STAGE]
+    world_size = first_optim[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if not (zero_stage <= 2 or zero_stage == 3):
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS if zero_stage <= 2 else FP32_FLAT_GROUPS
+
+    fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into consolidated fp32 ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    The weights are written to ``output_dir`` as ``pytorch_model*.bin`` or, with
+    ``safe_serialization``, ``model*.safetensors``. When the result is split into several shards
+    an ``*.index.json`` file mapping every weight to its shard file is written as well.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
+          Pass ``None`` to write a single file without sharding.
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        ImportError: re-raised (after printing an install hint) when ``safetensors`` or
+        ``huggingface_hub`` is required by the chosen options but not installed.
+    """
+
+    # Dependency pre-check: fail before any checkpoint is loaded if an optional package is missing
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict (lazy mode: entries are materialized shard by shard below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        # insert a ``{suffix}`` placeholder in front of the file extension for the shard names
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on placeholder tensors that only
+        # carry shape and dtype, so no weight has to be materialized for the planning step
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: build a minimal stand-in exposing the two attributes used below,
+        # with all weights assigned to a single file
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # materialize only the tensors that belong to the current shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/README.md b/dapo_lora_plus_20251202_001141/checkpoint-192/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-192/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-192/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-192/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-192/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-192/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/latest b/dapo_lora_plus_20251202_001141/checkpoint-192/latest
new file mode 100644
index 0000000000000000000000000000000000000000..36721df7ef9c6f050f37be6e76b3d130ed5cbfc7
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-192/latest
@@ -0,0 +1 @@
+global_step192
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-192/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-192/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/tokenizer_config.json b/dapo_lora_plus_20251202_001141/checkpoint-192/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-192/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-192/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..86474473dc82b1cdb8c5cd9c25cfca00610f917a
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-192/trainer_state.json
@@ -0,0 +1,5986 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.1766329346826127,
+  "eval_steps": 500,
+  "global_step": 192,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025745572056621313,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 5.499582130141789e-06,
+      "clip_ratio/high_mean": 1.3748955325354473e-06,
+      "clip_ratio/low_mean": 2.871888784738985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009378326623846e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 4767.1875,
+      "completions/mean_terminated_length": 4767.1875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.088237851858139,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002068034838885069,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 1425798.0,
+      "reward": 0.3046875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 0.01811397261917591,
+      "sampling/sampling_logp_difference/max": 4.011071681976318,
+      "sampling/sampling_logp_difference/mean": 0.01877593621611595,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.459846724103045e-05,
+      "clip_ratio/low_min": 3.4060874440910993e-06,
+      "clip_ratio/region_mean": 4.459846724103045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 6586.359375,
+      "completions/mean_terminated_length": 6351.21630859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0497623533010483,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001971944235265255,
+      "learning_rate": 1e-05,
+      "loss": 0.0199,
+      "num_tokens": 2287420.0,
+      "reward": 0.28125,
+      "reward_std": 0.29143062233924866,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999316334724426,
+      "sampling/importance_sampling_ratio/min": 5.356698966352269e-05,
+      "sampling/sampling_logp_difference/max": 9.834577560424805,
+      "sampling/sampling_logp_difference/mean": 0.02137824520468712,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.7640652004047297e-05,
+      "clip_ratio/high_mean": 5.48578327652649e-06,
+      "clip_ratio/low_mean": 3.218628648937738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.767206976590387e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14690.0,
+      "completions/max_terminated_length": 14690.0,
+      "completions/mean_length": 5448.0234375,
+      "completions/mean_terminated_length": 5448.0234375,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 1.1134418621659279,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016465173102915287,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 3009167.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27958330512046814,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 7.889385415182915e-06,
+      "sampling/sampling_logp_difference/max": 11.749992370605469,
+      "sampling/sampling_logp_difference/mean": 0.020580951124429703,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 1.3439519989333348e-05,
+      "clip_ratio/high_mean": 3.359879997333337e-06,
+      "clip_ratio/low_mean": 2.8849915906903334e-05,
+      "clip_ratio/low_min": 8.467687621305231e-06,
+      "clip_ratio/region_mean": 3.220979442630778e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13420.0,
+      "completions/mean_length": 5436.8671875,
+      "completions/mean_terminated_length": 5350.66943359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 1.1473859176039696,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023770295083522797,
+      "learning_rate": 1e-05,
+      "loss": 0.0153,
+      "num_tokens": 3725654.0,
+      "reward": 0.2734375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99991774559021,
+      "sampling/importance_sampling_ratio/min": 0.0011146117467433214,
+      "sampling/sampling_logp_difference/max": 6.799249172210693,
+      "sampling/sampling_logp_difference/mean": 0.020377254113554955,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 4.652201369026443e-06,
+      "clip_ratio/high_mean": 1.1630503422566107e-06,
+      "clip_ratio/low_mean": 2.8399212624208303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9562263534899103e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14440.0,
+      "completions/max_terminated_length": 14440.0,
+      "completions/mean_length": 4697.5390625,
+      "completions/mean_terminated_length": 4697.5390625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.0097229778766632,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003342699259519577,
+      "learning_rate": 1e-05,
+      "loss": 0.0326,
+      "num_tokens": 4345547.0,
+      "reward": 0.390625,
+      "reward_std": 0.34480881690979004,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914765357971,
+      "sampling/importance_sampling_ratio/min": 0.002385853324085474,
+      "sampling/sampling_logp_difference/max": 6.038198471069336,
+      "sampling/sampling_logp_difference/mean": 0.0185473021119833,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 9.362594937556423e-06,
+      "clip_ratio/high_mean": 2.340648734389106e-06,
+      "clip_ratio/low_mean": 6.054362825125281e-05,
+      "clip_ratio/low_min": 7.427356649714056e-06,
+      "clip_ratio/region_mean": 6.288427744038927e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14652.0,
+      "completions/mean_length": 6218.2109375,
+      "completions/mean_terminated_length": 5890.2822265625,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.0579778030514717,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002073560608550906,
+      "learning_rate": 1e-05,
+      "loss": 0.0201,
+      "num_tokens": 5160646.0,
+      "reward": 0.2109375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 0.00044544730917550623,
+      "sampling/sampling_logp_difference/max": 7.716431617736816,
+      "sampling/sampling_logp_difference/mean": 0.020321575924754143,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 1.1064067621191498e-05,
+      "clip_ratio/high_mean": 2.7660169052978745e-06,
+      "clip_ratio/low_mean": 2.2175867059104348e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4941883737028547e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13637.0,
+      "completions/mean_length": 5127.8359375,
+      "completions/mean_terminated_length": 5039.20458984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.0472618415951729,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032994600478559732,
+      "learning_rate": 1e-05,
+      "loss": 0.0751,
+      "num_tokens": 5836289.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999483227729797,
+      "sampling/importance_sampling_ratio/min": 0.0013780994340777397,
+      "sampling/sampling_logp_difference/max": 6.587049961090088,
+      "sampling/sampling_logp_difference/mean": 0.01940803974866867,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 1.2357884770608507e-05,
+      "clip_ratio/high_mean": 3.0894711926521268e-06,
+      "clip_ratio/low_mean": 3.000627111759968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.309574231025181e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15916.0,
+      "completions/mean_length": 4516.890625,
+      "completions/mean_terminated_length": 4423.44873046875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "entropy": 0.911251038312912,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003016560571268201,
+      "learning_rate": 1e-05,
+      "loss": 0.1006,
+      "num_tokens": 6433171.0,
+      "reward": 0.390625,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999179840087891,
+      "sampling/importance_sampling_ratio/min": 0.005480794236063957,
+      "sampling/sampling_logp_difference/max": 5.206505298614502,
+      "sampling/sampling_logp_difference/mean": 0.017437148839235306,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 4.6329013457580004e-05,
+      "clip_ratio/high_mean": 1.1582253364395001e-05,
+      "clip_ratio/low_mean": 7.069455705277505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.227681109929108e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13970.0,
+      "completions/mean_length": 4961.453125,
+      "completions/mean_terminated_length": 4687.31201171875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "entropy": 0.6808596402406693,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0035386616364121437,
+      "learning_rate": 1e-05,
+      "loss": 0.0596,
+      "num_tokens": 7085389.0,
+      "reward": 0.5625,
+      "reward_std": 0.3816363215446472,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0002734088629949838,
+      "sampling/sampling_logp_difference/max": 8.20454216003418,
+      "sampling/sampling_logp_difference/mean": 0.01566406339406967,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 2.43190661421977e-05,
+      "clip_ratio/high_mean": 6.079766535549425e-06,
+      "clip_ratio/low_mean": 2.2395396172214532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8475162707763957e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14776.0,
+      "completions/mean_length": 4429.40625,
+      "completions/mean_terminated_length": 4335.275390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9181502386927605,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022535293828696012,
+      "learning_rate": 1e-05,
+      "loss": 0.0031,
+      "num_tokens": 7672185.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20357418060302734,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998801946640015,
+      "sampling/importance_sampling_ratio/min": 5.315856554943821e-08,
+      "sampling/sampling_logp_difference/max": 16.74998664855957,
+      "sampling/sampling_logp_difference/mean": 0.018429335206747055,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 1.0117325928149512e-05,
+      "clip_ratio/high_mean": 2.529331482037378e-06,
+      "clip_ratio/low_mean": 1.1982813475697185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.45121450714214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5282.6796875,
+      "completions/mean_terminated_length": 5106.46875,
+      "completions/min_length": 323.0,
+      "completions/min_terminated_length": 323.0,
+      "entropy": 1.113751620054245,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013591813622042537,
+      "learning_rate": 1e-05,
+      "loss": 0.0971,
+      "num_tokens": 8369000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3029736578464508,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 3.970265970565379e-05,
+      "sampling/sampling_logp_difference/max": 10.134092330932617,
+      "sampling/sampling_logp_difference/mean": 0.020221836864948273,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 5.411958227341529e-06,
+      "clip_ratio/high_mean": 1.3529895568353822e-06,
+      "clip_ratio/low_mean": 2.5284593846208736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6637583516730956e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15925.0,
+      "completions/mean_length": 6970.421875,
+      "completions/mean_terminated_length": 6744.49609375,
+      "completions/min_length": 53.0,
+      "completions/min_terminated_length": 53.0,
+      "entropy": 1.1721933633089066,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024079051800072193,
+      "learning_rate": 1e-05,
+      "loss": 0.0713,
+      "num_tokens": 9283182.0,
+      "reward": 0.171875,
+      "reward_std": 0.17965975403785706,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999163746833801,
+      "sampling/importance_sampling_ratio/min": 0.0008915197686292231,
+      "sampling/sampling_logp_difference/max": 7.0225830078125,
+      "sampling/sampling_logp_difference/mean": 0.021462474018335342,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 2.0661535927501973e-05,
+      "clip_ratio/high_mean": 5.165383981875493e-06,
+      "clip_ratio/low_mean": 2.4304956298237812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.947033948430544e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14658.0,
+      "completions/max_terminated_length": 14658.0,
+      "completions/mean_length": 4886.875,
+      "completions/mean_terminated_length": 4886.875,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 1.0108910650014877,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002063734456896782,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 9928446.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2409384697675705,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000026226043701,
+      "sampling/importance_sampling_ratio/min": 0.0003672837920021266,
+      "sampling/sampling_logp_difference/max": 7.9093756675720215,
+      "sampling/sampling_logp_difference/mean": 0.01918785460293293,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.4761846993424115e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4761846993424115e-06,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12992.0,
+      "completions/max_terminated_length": 12992.0,
+      "completions/mean_length": 4824.0078125,
+      "completions/mean_terminated_length": 4824.0078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 1.1070282831788063,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002424790756776929,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 10566415.0,
+      "reward": 0.28125,
+      "reward_std": 0.23698672652244568,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0011708867968991399,
+      "sampling/sampling_logp_difference/max": 6.749993801116943,
+      "sampling/sampling_logp_difference/mean": 0.02069389820098877,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 3.5075904634140898e-06,
+      "clip_ratio/high_mean": 8.768976158535224e-07,
+      "clip_ratio/low_mean": 2.2676964135825983e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3553861751679506e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12685.0,
+      "completions/mean_length": 5449.4140625,
+      "completions/mean_terminated_length": 5363.31494140625,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9817888736724854,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021046048495918512,
+      "learning_rate": 1e-05,
+      "loss": 0.0252,
+      "num_tokens": 11281908.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805688858032,
+      "sampling/importance_sampling_ratio/min": 0.013273254036903381,
+      "sampling/sampling_logp_difference/max": 4.322004318237305,
+      "sampling/sampling_logp_difference/mean": 0.019556276500225067,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 1.624216065465589e-05,
+      "clip_ratio/high_mean": 4.060540163663973e-06,
+      "clip_ratio/low_mean": 5.4349347919924185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.840988796990132e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14133.0,
+      "completions/max_terminated_length": 14133.0,
+      "completions/mean_length": 5343.25,
+      "completions/mean_terminated_length": 5343.25,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 1.04741720110178,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035894038155674934,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 11987692.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998996257781982,
+      "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05,
+      "sampling/sampling_logp_difference/max": 10.749964714050293,
+      "sampling/sampling_logp_difference/mean": 0.020530637353658676,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.272115029380075e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.272115029380075e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15138.0,
+      "completions/mean_length": 6301.9375,
+      "completions/mean_terminated_length": 5806.09814453125,
+      "completions/min_length": 72.0,
+      "completions/min_terminated_length": 72.0,
+      "entropy": 0.8892941772937775,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032246762420982122,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 12814244.0,
+      "reward": 0.3125,
+      "reward_std": 0.3606000542640686,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999184608459473,
+      "sampling/importance_sampling_ratio/min": 0.021351110190153122,
+      "sampling/sampling_logp_difference/max": 3.846651554107666,
+      "sampling/sampling_logp_difference/mean": 0.017541853711009026,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 9.956602298188955e-06,
+      "clip_ratio/high_mean": 2.4891505745472386e-06,
+      "clip_ratio/low_mean": 2.772165316855535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0210803743102588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16213.0,
+      "completions/max_terminated_length": 16213.0,
+      "completions/mean_length": 5297.46875,
+      "completions/mean_terminated_length": 5297.46875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8097029253840446,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023969109170138836,
+      "learning_rate": 1e-05,
+      "loss": -0.0153,
+      "num_tokens": 13512520.0,
+      "reward": 0.359375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999222159385681,
+      "sampling/importance_sampling_ratio/min": 0.005766105372458696,
+      "sampling/sampling_logp_difference/max": 5.155758380889893,
+      "sampling/sampling_logp_difference/mean": 0.017464376986026764,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 1.0098337497765897e-05,
+      "clip_ratio/high_mean": 2.524584374441474e-06,
+      "clip_ratio/low_mean": 3.173396362399217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.425854845318099e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14655.0,
+      "completions/mean_length": 4890.34375,
+      "completions/mean_terminated_length": 4799.84228515625,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.9267145916819572,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002759338356554508,
+      "learning_rate": 1e-05,
+      "loss": -0.0014,
+      "num_tokens": 14155556.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.008491010405123234,
+      "sampling/sampling_logp_difference/max": 4.768747329711914,
+      "sampling/sampling_logp_difference/mean": 0.018839433789253235,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 7.532389190600952e-06,
+      "clip_ratio/high_mean": 1.883097297650238e-06,
+      "clip_ratio/low_mean": 1.9051809317716106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0934906729053182e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16296.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 4609.40625,
+      "completions/mean_terminated_length": 4609.40625,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 1.171089917421341,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021055075339972973,
+      "learning_rate": 1e-05,
+      "loss": -0.0051,
+      "num_tokens": 14765328.0,
+      "reward": 0.2421875,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741911888123,
+      "sampling/importance_sampling_ratio/min": 5.368983693188056e-07,
+      "sampling/sampling_logp_difference/max": 14.437457084655762,
+      "sampling/sampling_logp_difference/mean": 0.020226795226335526,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.7169573766295798e-05,
+      "clip_ratio/high_mean": 4.2923934415739495e-06,
+      "clip_ratio/low_mean": 5.869748633813288e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.0162142189074075e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14299.0,
+      "completions/mean_length": 5099.0390625,
+      "completions/mean_terminated_length": 5010.18115234375,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.005959376692772,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027595218271017075,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 15438549.0,
+      "reward": 0.296875,
+      "reward_std": 0.20069602131843567,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887347221375,
+      "sampling/importance_sampling_ratio/min": 0.00013984869292471558,
+      "sampling/sampling_logp_difference/max": 8.87494945526123,
+      "sampling/sampling_logp_difference/mean": 0.01902824640274048,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 5.162942670722259e-06,
+      "clip_ratio/high_mean": 1.2907356676805648e-06,
+      "clip_ratio/low_mean": 3.6872071063953626e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.816280593582633e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7138.0390625,
+      "completions/mean_terminated_length": 6839.7822265625,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.0403362140059471,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002748022088780999,
+      "learning_rate": 1e-05,
+      "loss": 0.0647,
+      "num_tokens": 16373898.0,
+      "reward": 0.296875,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999048709869385,
+      "sampling/importance_sampling_ratio/min": 0.0003802926803473383,
+      "sampling/sampling_logp_difference/max": 7.874569416046143,
+      "sampling/sampling_logp_difference/mean": 0.020853528752923012,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.6506045439164154e-05,
+      "clip_ratio/low_min": 5.709326615033206e-06,
+      "clip_ratio/region_mean": 5.6506045439164154e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14543.0,
+      "completions/mean_length": 5420.515625,
+      "completions/mean_terminated_length": 5334.18896484375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 1.1339883506298065,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029502976685762405,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 17088156.0,
+      "reward": 0.1953125,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 9.70982582657598e-05,
+      "sampling/sampling_logp_difference/max": 9.239787101745605,
+      "sampling/sampling_logp_difference/mean": 0.0199423898011446,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 5.619998319161823e-06,
+      "clip_ratio/high_mean": 1.4049995797904558e-06,
+      "clip_ratio/low_mean": 6.439320418394345e-05,
+      "clip_ratio/low_min": 4.70632539872895e-06,
+      "clip_ratio/region_mean": 6.57982034226734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14636.0,
+      "completions/mean_length": 5116.3046875,
+      "completions/mean_terminated_length": 4845.88037109375,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.9503882825374603,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004891107324510813,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 17766619.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0010618992382660508,
+      "sampling/sampling_logp_difference/max": 6.847696304321289,
+      "sampling/sampling_logp_difference/mean": 0.01914183795452118,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.839018643247982e-05,
+      "clip_ratio/low_min": 4.115091087442124e-06,
+      "clip_ratio/region_mean": 3.839018643247982e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14634.0,
+      "completions/mean_length": 5061.8671875,
+      "completions/mean_terminated_length": 4972.71630859375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.0540335327386856,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030373274348676205,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 18432938.0,
+      "reward": 0.34375,
+      "reward_std": 0.28118088841438293,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999624490737915,
+      "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06,
+      "sampling/sampling_logp_difference/max": 13.272432327270508,
+      "sampling/sampling_logp_difference/mean": 0.019548218697309494,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.4656657867817557e-05,
+      "clip_ratio/high_mean": 4.665093399580655e-06,
+      "clip_ratio/low_mean": 3.751162262233265e-05,
+      "clip_ratio/low_min": 4.413062470121076e-06,
+      "clip_ratio/region_mean": 4.2176716192443564e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15782.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6349.9765625,
+      "completions/mean_terminated_length": 6349.9765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0268081277608871,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017623496241867542,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 19264743.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 6.870362267363816e-05,
+      "sampling/sampling_logp_difference/max": 9.585708618164062,
+      "sampling/sampling_logp_difference/mean": 0.019106190651655197,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 9.221375876222737e-06,
+      "clip_ratio/high_mean": 2.3053439690556843e-06,
+      "clip_ratio/low_mean": 3.09787185415189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.328406273794826e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 5815.484375,
+      "completions/mean_terminated_length": 5561.84033203125,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 1.0389493256807327,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003111837198957801,
+      "learning_rate": 1e-05,
+      "loss": -0.0162,
+      "num_tokens": 20030109.0,
+      "reward": 0.34375,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000298023223877,
+      "sampling/importance_sampling_ratio/min": 0.02987043187022209,
+      "sampling/sampling_logp_difference/max": 3.5108861923217773,
+      "sampling/sampling_logp_difference/mean": 0.020060991868376732,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 6.7810142354574054e-06,
+      "clip_ratio/high_mean": 1.6952535588643514e-06,
+      "clip_ratio/low_mean": 4.474762545214617e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644287901101052e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 5157.1484375,
+      "completions/mean_terminated_length": 5068.748046875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.0510126948356628,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003041633637621999,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 20710904.0,
+      "reward": 0.3125,
+      "reward_std": 0.35612428188323975,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 0.04357198625802994,
+      "sampling/sampling_logp_difference/max": 3.133340835571289,
+      "sampling/sampling_logp_difference/mean": 0.019007597118616104,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0962848566341563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0962848566341563e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15333.0,
+      "completions/max_terminated_length": 15333.0,
+      "completions/mean_length": 4446.3828125,
+      "completions/mean_terminated_length": 4446.3828125,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.053279548883438,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022369560319930315,
+      "learning_rate": 1e-05,
+      "loss": -0.001,
+      "num_tokens": 21298497.0,
+      "reward": 0.390625,
+      "reward_std": 0.24169495701789856,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998750686645508,
+      "sampling/importance_sampling_ratio/min": 0.006704842206090689,
+      "sampling/sampling_logp_difference/max": 5.00492525100708,
+      "sampling/sampling_logp_difference/mean": 0.01947362720966339,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8460265411922592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8460265411922592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15386.0,
+      "completions/mean_length": 6294.1484375,
+      "completions/mean_terminated_length": 6133.9921875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.2036212533712387,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021383841522037983,
+      "learning_rate": 1e-05,
+      "loss": 0.033,
+      "num_tokens": 22124812.0,
+      "reward": 0.171875,
+      "reward_std": 0.20752590894699097,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858736991882,
+      "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07,
+      "sampling/sampling_logp_difference/max": 14.742476463317871,
+      "sampling/sampling_logp_difference/mean": 0.022367021068930626,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 1.73864664247958e-05,
+      "clip_ratio/high_mean": 4.34661660619895e-06,
+      "clip_ratio/low_mean": 3.19569651310303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.630358173722925e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14893.0,
+      "completions/mean_length": 6011.4921875,
+      "completions/mean_terminated_length": 5929.81884765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.123318687081337,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00126531848218292,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 22915091.0,
+      "reward": 0.171875,
+      "reward_std": 0.2330477386713028,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05,
+      "sampling/sampling_logp_difference/max": 11.02016544342041,
+      "sampling/sampling_logp_difference/mean": 0.019905246794223785,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 2.8753217975463485e-05,
+      "clip_ratio/high_mean": 7.188304493865871e-06,
+      "clip_ratio/low_mean": 3.818478444372886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.537308905128157e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5152.46875,
+      "completions/mean_terminated_length": 5064.03125,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "entropy": 1.0477670058608055,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030069497879594564,
+      "learning_rate": 1e-05,
+      "loss": 0.1026,
+      "num_tokens": 23596487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29142576456069946,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999433755874634,
+      "sampling/importance_sampling_ratio/min": 9.009604013954231e-07,
+      "sampling/sampling_logp_difference/max": 13.919804573059082,
+      "sampling/sampling_logp_difference/mean": 0.019003981724381447,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 3.069575450354023e-05,
+      "clip_ratio/high_mean": 7.673938625885057e-06,
+      "clip_ratio/low_mean": 3.4847614415411954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.252155258654966e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12792.0,
+      "completions/max_terminated_length": 12792.0,
+      "completions/mean_length": 4672.5703125,
+      "completions/mean_terminated_length": 4672.5703125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9471446052193642,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002676331205293536,
+      "learning_rate": 1e-05,
+      "loss": 0.0724,
+      "num_tokens": 24213408.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2988021969795227,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000251531600952,
+      "sampling/importance_sampling_ratio/min": 0.0013351094676181674,
+      "sampling/sampling_logp_difference/max": 6.618741989135742,
+      "sampling/sampling_logp_difference/mean": 0.0179576613008976,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6127243245355203e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6127243245355203e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16108.0,
+      "completions/mean_length": 7013.734375,
+      "completions/mean_terminated_length": 6711.4677734375,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "entropy": 1.1254516392946243,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023615453392267227,
+      "learning_rate": 1e-05,
+      "loss": 0.0384,
+      "num_tokens": 25130262.0,
+      "reward": 0.1953125,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06,
+      "sampling/sampling_logp_difference/max": 11.925450325012207,
+      "sampling/sampling_logp_difference/mean": 0.0215257927775383,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 4.06954040954588e-06,
+      "clip_ratio/high_mean": 1.01738510238647e-06,
+      "clip_ratio/low_mean": 4.180071573500754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.281810015527299e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5858.59375,
+      "completions/mean_terminated_length": 5605.984375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "entropy": 1.0713739022612572,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029018481727689505,
+      "learning_rate": 1e-05,
+      "loss": 0.1041,
+      "num_tokens": 25898194.0,
+      "reward": 0.3671875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999915957450867,
+      "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05,
+      "sampling/sampling_logp_difference/max": 10.992064476013184,
+      "sampling/sampling_logp_difference/mean": 0.019959844648838043,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 1.2810827229259303e-05,
+      "clip_ratio/high_mean": 3.2027068073148257e-06,
+      "clip_ratio/low_mean": 3.29701083501277e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.617281504375569e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14004.0,
+      "completions/mean_length": 6952.6015625,
+      "completions/mean_terminated_length": 6726.24853515625,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "entropy": 1.028619796037674,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022342968732118607,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 26812791.0,
+      "reward": 0.234375,
+      "reward_std": 0.26827272772789,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 4.540153167909011e-05,
+      "sampling/sampling_logp_difference/max": 9.999964714050293,
+      "sampling/sampling_logp_difference/mean": 0.02002539485692978,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 1.5225089100567857e-05,
+      "clip_ratio/high_mean": 6.960676159906143e-06,
+      "clip_ratio/low_mean": 4.09088329433871e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7869508762232726e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16361.0,
+      "completions/mean_length": 6413.421875,
+      "completions/mean_terminated_length": 6174.12841796875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9452399462461472,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021800603717565536,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 27652757.0,
+      "reward": 0.296875,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439120292664,
+      "sampling/importance_sampling_ratio/min": 3.895394547726028e-05,
+      "sampling/sampling_logp_difference/max": 10.153130531311035,
+      "sampling/sampling_logp_difference/mean": 0.019722118973731995,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.9564903318023426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9564903318023426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15754.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 5176.3515625,
+      "completions/mean_terminated_length": 5176.3515625,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 1.0444758981466293,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004153470974415541,
+      "learning_rate": 1e-05,
+      "loss": 0.0798,
+      "num_tokens": 28334386.0,
+      "reward": 0.2734375,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.007421077694743872,
+      "sampling/sampling_logp_difference/max": 4.903430938720703,
+      "sampling/sampling_logp_difference/mean": 0.020159056410193443,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 1.725743459246587e-05,
+      "clip_ratio/high_mean": 4.3143586481164675e-06,
+      "clip_ratio/low_mean": 2.0204584302518924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.451894306432223e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 5178.9921875,
+      "completions/mean_terminated_length": 5001.13525390625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0803537145256996,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002477057045325637,
+      "learning_rate": 1e-05,
+      "loss": 0.0067,
+      "num_tokens": 29017145.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000497102737427,
+      "sampling/importance_sampling_ratio/min": 0.004630985204130411,
+      "sampling/sampling_logp_difference/max": 5.374985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019826076924800873,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 1.6637992303003557e-05,
+      "clip_ratio/high_mean": 4.159498075750889e-06,
+      "clip_ratio/low_mean": 2.1970684144889674e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6130182106953725e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14131.0,
+      "completions/max_terminated_length": 14131.0,
+      "completions/mean_length": 4980.359375,
+      "completions/mean_terminated_length": 4980.359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.9510642662644386,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016275218222290277,
+      "learning_rate": 1e-05,
+      "loss": -0.0097,
+      "num_tokens": 29673535.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750852584839,
+      "sampling/importance_sampling_ratio/min": 0.000599516904912889,
+      "sampling/sampling_logp_difference/max": 7.419386386871338,
+      "sampling/sampling_logp_difference/mean": 0.01844976656138897,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 2.8087193186365766e-05,
+      "clip_ratio/high_mean": 7.021798296591442e-06,
+      "clip_ratio/low_mean": 3.9683913541921356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.670571286169434e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 5778.6953125,
+      "completions/mean_terminated_length": 5695.18896484375,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 1.0413239300251007,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001847646082751453,
+      "learning_rate": 1e-05,
+      "loss": -0.0045,
+      "num_tokens": 30436416.0,
+      "reward": 0.2578125,
+      "reward_std": 0.33903977274894714,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998501539230347,
+      "sampling/importance_sampling_ratio/min": 0.00020348970429040492,
+      "sampling/sampling_logp_difference/max": 8.499895095825195,
+      "sampling/sampling_logp_difference/mean": 0.021502099931240082,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 2.68402091023745e-05,
+      "clip_ratio/high_mean": 8.575278570788214e-06,
+      "clip_ratio/low_mean": 4.547183698377921e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.404711600931478e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14182.0,
+      "completions/max_terminated_length": 14182.0,
+      "completions/mean_length": 4875.125,
+      "completions/mean_terminated_length": 4875.125,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "entropy": 1.0464690178632736,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021134833805263042,
+      "learning_rate": 1e-05,
+      "loss": 0.0727,
+      "num_tokens": 31083672.0,
+      "reward": 0.40625,
+      "reward_std": 0.3584783971309662,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340176582336,
+      "sampling/importance_sampling_ratio/min": 0.012113225646317005,
+      "sampling/sampling_logp_difference/max": 4.41345739364624,
+      "sampling/sampling_logp_difference/mean": 0.019140049815177917,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 3.9877967992651975e-05,
+      "clip_ratio/high_mean": 9.969491998162994e-06,
+      "clip_ratio/low_mean": 3.981287841270387e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9782369273998484e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 4691.421875,
+      "completions/mean_terminated_length": 4505.82568359375,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 1.0229775309562683,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037735572550445795,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 31703654.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2993389964103699,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492168426514,
+      "sampling/importance_sampling_ratio/min": 0.03150063753128052,
+      "sampling/sampling_logp_difference/max": 3.457747459411621,
+      "sampling/sampling_logp_difference/mean": 0.01912039890885353,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 3.5441889849607833e-06,
+      "clip_ratio/high_mean": 8.860472462401958e-07,
+      "clip_ratio/low_mean": 1.5137359810069029e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6023407056309225e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 6821.96875,
+      "completions/mean_terminated_length": 6592.48046875,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 1.1132484003901482,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0010448681423440576,
+      "learning_rate": 1e-05,
+      "loss": 0.022,
+      "num_tokens": 32599778.0,
+      "reward": 0.2265625,
+      "reward_std": 0.1814819872379303,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999915361404419,
+      "sampling/importance_sampling_ratio/min": 0.006500681862235069,
+      "sampling/sampling_logp_difference/max": 5.035848140716553,
+      "sampling/sampling_logp_difference/mean": 0.02125459350645542,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 4.652893949241843e-06,
+      "clip_ratio/high_mean": 1.1632234873104608e-06,
+      "clip_ratio/low_mean": 5.731516603191267e-05,
+      "clip_ratio/low_min": 9.891066838463303e-06,
+      "clip_ratio/region_mean": 5.8478389746596804e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 6834.3671875,
+      "completions/mean_terminated_length": 6605.17626953125,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9827468693256378,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0017670176457613707,
+      "learning_rate": 1e-05,
+      "loss": 0.1105,
+      "num_tokens": 33492737.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3440523147583008,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.0021202093921601772,
+      "sampling/sampling_logp_difference/max": 6.156240463256836,
+      "sampling/sampling_logp_difference/mean": 0.019490526989102364,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 6.717360520269722e-06,
+      "clip_ratio/high_mean": 2.503530367903295e-06,
+      "clip_ratio/low_mean": 2.5672919832686603e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8176450200589898e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14098.0,
+      "completions/mean_length": 6175.296875,
+      "completions/mean_terminated_length": 5845.98388671875,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 1.1584237962961197,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0016891945851966739,
+      "learning_rate": 1e-05,
+      "loss": -0.0008,
+      "num_tokens": 34312455.0,
+      "reward": 0.1875,
+      "reward_std": 0.19673937559127808,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 8.086384332273155e-05,
+      "sampling/sampling_logp_difference/max": 9.422743797302246,
+      "sampling/sampling_logp_difference/mean": 0.021749887615442276,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 2.2362002255249536e-05,
+      "clip_ratio/high_mean": 8.189798336388776e-06,
+      "clip_ratio/low_mean": 2.1058204993096297e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9248002192616696e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16054.0,
+      "completions/mean_length": 6036.8359375,
+      "completions/mean_terminated_length": 5955.3623046875,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.9301538467407227,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003834392176941037,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 35102738.0,
+      "reward": 0.4375,
+      "reward_std": 0.36614155769348145,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998494386672974,
+      "sampling/importance_sampling_ratio/min": 0.00013992394087836146,
+      "sampling/sampling_logp_difference/max": 8.874411582946777,
+      "sampling/sampling_logp_difference/mean": 0.019147861748933792,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1501961580506759e-05,
+      "clip_ratio/high_mean": 2.8754903951266897e-06,
+      "clip_ratio/low_mean": 4.08189714562468e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.369446196506033e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6262.46875,
+      "completions/mean_terminated_length": 5764.68798828125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "entropy": 0.8599015846848488,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029804729856550694,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 35924886.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3911295533180237,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999922513961792,
+      "sampling/importance_sampling_ratio/min": 0.00021375219512265176,
+      "sampling/sampling_logp_difference/max": 9.904524803161621,
+      "sampling/sampling_logp_difference/mean": 0.01815103553235531,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 2.4107544049911667e-05,
+      "clip_ratio/high_mean": 6.026886012477917e-06,
+      "clip_ratio/low_mean": 3.6588148361715866e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.261503391944643e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14556.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 5926.8984375,
+      "completions/mean_terminated_length": 5926.8984375,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "entropy": 1.0042993426322937,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022071697749197483,
+      "learning_rate": 1e-05,
+      "loss": 0.0059,
+      "num_tokens": 36700913.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3306073546409607,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000010371208191,
+      "sampling/importance_sampling_ratio/min": 0.0005220364546403289,
+      "sampling/sampling_logp_difference/max": 7.557773113250732,
+      "sampling/sampling_logp_difference/mean": 0.01954064890742302,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 4.9106265578302555e-06,
+      "clip_ratio/high_mean": 1.2276566394575639e-06,
+      "clip_ratio/low_mean": 2.634599570683349e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7573652346291055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15217.0,
+      "completions/mean_length": 6873.6875,
+      "completions/mean_terminated_length": 6645.4404296875,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 1.0255412608385086,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002320924773812294,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 37604865.0,
+      "reward": 0.234375,
+      "reward_std": 0.3135228157043457,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999098777770996,
+      "sampling/importance_sampling_ratio/min": 0.026153141632676125,
+      "sampling/sampling_logp_difference/max": 3.6437859535217285,
+      "sampling/sampling_logp_difference/mean": 0.019532475620508194,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.6350510122720152e-05,
+      "clip_ratio/high_mean": 4.087627530680038e-06,
+      "clip_ratio/low_mean": 2.351988746340794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7607515221461654e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15668.0,
+      "completions/mean_length": 6073.8984375,
+      "completions/mean_terminated_length": 5992.71630859375,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 1.0713753998279572,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002212709980085492,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 38405196.0,
+      "reward": 0.359375,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998978972434998,
+      "sampling/importance_sampling_ratio/min": 8.706459084351081e-06,
+      "sampling/sampling_logp_difference/max": 11.651445388793945,
+      "sampling/sampling_logp_difference/mean": 0.021252838894724846,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.729486718384578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.729486718384578e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15299.0,
+      "completions/mean_length": 5838.71875,
+      "completions/mean_terminated_length": 5671.33349609375,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "entropy": 1.021155133843422,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001135052996687591,
+      "learning_rate": 1e-05,
+      "loss": 0.0178,
+      "num_tokens": 39171704.0,
+      "reward": 0.28125,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.003084881929680705,
+      "sampling/sampling_logp_difference/max": 5.7812418937683105,
+      "sampling/sampling_logp_difference/mean": 0.020781882107257843,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 1.7124169744420215e-05,
+      "clip_ratio/high_mean": 4.281042436105054e-06,
+      "clip_ratio/low_mean": 3.706903294187214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.135007543482061e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14617.0,
+      "completions/max_terminated_length": 14617.0,
+      "completions/mean_length": 6358.5859375,
+      "completions/mean_terminated_length": 6358.5859375,
+      "completions/min_length": 940.0,
+      "completions/min_terminated_length": 940.0,
+      "entropy": 0.9720487147569656,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002638082252815366,
+      "learning_rate": 1e-05,
+      "loss": 0.0145,
+      "num_tokens": 40003859.0,
+      "reward": 0.40625,
+      "reward_std": 0.3174618184566498,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000380277633667,
+      "sampling/importance_sampling_ratio/min": 0.01960253342986107,
+      "sampling/sampling_logp_difference/max": 3.932096481323242,
+      "sampling/sampling_logp_difference/mean": 0.01991666667163372,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 6.55582925901399e-06,
+      "clip_ratio/high_mean": 2.994117721755174e-06,
+      "clip_ratio/low_mean": 2.222621503733535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5220332759090525e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14753.0,
+      "completions/max_terminated_length": 14753.0,
+      "completions/mean_length": 4634.1875,
+      "completions/mean_terminated_length": 4634.1875,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9715309366583824,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001994960242882371,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 40616483.0,
+      "reward": 0.4375,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000698566436768,
+      "sampling/importance_sampling_ratio/min": 1.0510009815334342e-05,
+      "sampling/sampling_logp_difference/max": 11.46318244934082,
+      "sampling/sampling_logp_difference/mean": 0.01902047172188759,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 2.2474248908110894e-05,
+      "clip_ratio/high_mean": 7.571314540655294e-06,
+      "clip_ratio/low_mean": 4.3583780325207044e-05,
+      "clip_ratio/low_min": 4.6013396968191955e-06,
+      "clip_ratio/region_mean": 5.1155094070054474e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 6596.25,
+      "completions/mean_terminated_length": 6361.34423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 0.8207943215966225,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019902780186384916,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 41484443.0,
+      "reward": 0.4453125,
+      "reward_std": 0.326668381690979,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000016689300537,
+      "sampling/importance_sampling_ratio/min": 7.485233072657138e-05,
+      "sampling/sampling_logp_difference/max": 9.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.018301833420991898,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 3.0019932637515012e-06,
+      "clip_ratio/high_mean": 7.504983159378753e-07,
+      "clip_ratio/low_mean": 4.332785601945943e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.407835376696312e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6785.75,
+      "completions/mean_terminated_length": 6313.70458984375,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "entropy": 0.9876058474183083,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015235114842653275,
+      "learning_rate": 1e-05,
+      "loss": 0.0128,
+      "num_tokens": 42372235.0,
+      "reward": 0.2421875,
+      "reward_std": 0.325075626373291,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999551773071289,
+      "sampling/importance_sampling_ratio/min": 0.026679370552301407,
+      "sampling/sampling_logp_difference/max": 3.6238646507263184,
+      "sampling/sampling_logp_difference/mean": 0.019945615902543068,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1349006601667497e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1349006601667497e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 4881.2109375,
+      "completions/mean_terminated_length": 4510.1533203125,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.989942155778408,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002033712575212121,
+      "learning_rate": 1e-05,
+      "loss": 0.1088,
+      "num_tokens": 43015238.0,
+      "reward": 0.4375,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000300407409668,
+      "sampling/importance_sampling_ratio/min": 0.0001238943514181301,
+      "sampling/sampling_logp_difference/max": 8.996081352233887,
+      "sampling/sampling_logp_difference/mean": 0.01887543685734272,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 2.584004687378183e-05,
+      "clip_ratio/high_mean": 6.4600117184454575e-06,
+      "clip_ratio/low_mean": 2.1371045761497953e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7831058105221018e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15001.0,
+      "completions/max_terminated_length": 15001.0,
+      "completions/mean_length": 4725.3984375,
+      "completions/mean_terminated_length": 4725.3984375,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 1.0350637435913086,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030296226032078266,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 43637737.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999939203262329,
+      "sampling/importance_sampling_ratio/min": 0.00022932067804504186,
+      "sampling/sampling_logp_difference/max": 8.380389213562012,
+      "sampling/sampling_logp_difference/mean": 0.01995944231748581,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 1.994733975152485e-05,
+      "clip_ratio/high_mean": 4.986834937881213e-06,
+      "clip_ratio/low_mean": 3.5168303838872816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.015513832200668e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16240.0,
+      "completions/mean_length": 4918.171875,
+      "completions/mean_terminated_length": 4736.1748046875,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.965274304151535,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002758471528068185,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 44285327.0,
+      "reward": 0.328125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999663233757019,
+      "sampling/importance_sampling_ratio/min": 0.010958661325275898,
+      "sampling/sampling_logp_difference/max": 4.513625144958496,
+      "sampling/sampling_logp_difference/mean": 0.019083233550190926,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 1.0621563887980301e-05,
+      "clip_ratio/high_mean": 2.6553909719950752e-06,
+      "clip_ratio/low_mean": 3.838553107016196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1040922042157035e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15031.0,
+      "completions/mean_length": 4998.2890625,
+      "completions/mean_terminated_length": 4908.6376953125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9200445115566254,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027611786499619484,
+      "learning_rate": 1e-05,
+      "loss": 0.0575,
+      "num_tokens": 44944356.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3895368278026581,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884366989136,
+      "sampling/importance_sampling_ratio/min": 0.0018651526188477874,
+      "sampling/sampling_logp_difference/max": 6.284412384033203,
+      "sampling/sampling_logp_difference/mean": 0.017853498458862305,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.0136624496226432e-05,
+      "clip_ratio/high_mean": 2.534156124056608e-06,
+      "clip_ratio/low_mean": 2.0260404085092887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2794560095462657e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6290.1796875,
+      "completions/mean_terminated_length": 6129.96044921875,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.9360214695334435,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015557854203507304,
+      "learning_rate": 1e-05,
+      "loss": 0.0111,
+      "num_tokens": 45767867.0,
+      "reward": 0.34375,
+      "reward_std": 0.30168038606643677,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999427795410156,
+      "sampling/importance_sampling_ratio/min": 0.0011004531988874078,
+      "sampling/sampling_logp_difference/max": 6.812033176422119,
+      "sampling/sampling_logp_difference/mean": 0.0200855303555727,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 2.2559511307918e-06,
+      "clip_ratio/high_mean": 5.6398778269795e-07,
+      "clip_ratio/low_mean": 4.51761221711422e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.574010984015331e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16366.0,
+      "completions/mean_length": 6486.15625,
+      "completions/mean_terminated_length": 6248.6083984375,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "entropy": 0.863138921558857,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026953541673719883,
+      "learning_rate": 1e-05,
+      "loss": -0.0194,
+      "num_tokens": 46618575.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999406337738037,
+      "sampling/importance_sampling_ratio/min": 0.0011708897072821856,
+      "sampling/sampling_logp_difference/max": 6.749991416931152,
+      "sampling/sampling_logp_difference/mean": 0.01863238587975502,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.0073357771034352e-05,
+      "clip_ratio/high_mean": 2.518339442758588e-06,
+      "clip_ratio/low_mean": 2.787370635815023e-05,
+      "clip_ratio/low_min": 3.837534222839167e-06,
+      "clip_ratio/region_mean": 3.0392045573535142e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 6442.7734375,
+      "completions/mean_terminated_length": 6284.9765625,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0242054909467697,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024442619178444147,
+      "learning_rate": 1e-05,
+      "loss": 0.0569,
+      "num_tokens": 47462274.0,
+      "reward": 0.328125,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998892545700073,
+      "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09,
+      "sampling/sampling_logp_difference/max": 19.124980926513672,
+      "sampling/sampling_logp_difference/mean": 0.019810764119029045,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 1.220810372615233e-05,
+      "clip_ratio/high_mean": 3.0520259315380827e-06,
+      "clip_ratio/low_mean": 4.339240456374682e-05,
+      "clip_ratio/low_min": 4.491233084991109e-06,
+      "clip_ratio/region_mean": 4.644443038159807e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 4807.765625,
+      "completions/mean_terminated_length": 4716.6142578125,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "entropy": 1.045751042664051,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002512057079002261,
+      "learning_rate": 1e-05,
+      "loss": 0.003,
+      "num_tokens": 48096692.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999058842658997,
+      "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05,
+      "sampling/sampling_logp_difference/max": 11.374892234802246,
+      "sampling/sampling_logp_difference/mean": 0.01960371434688568,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 5.37941218681226e-06,
+      "clip_ratio/high_mean": 1.344853046703065e-06,
+      "clip_ratio/low_mean": 3.0161771633174794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1506624850408116e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 6703.8359375,
+      "completions/mean_terminated_length": 6471.51220703125,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 1.0592866837978363,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016389708034694195,
+      "learning_rate": 1e-05,
+      "loss": -0.024,
+      "num_tokens": 48974399.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2585548758506775,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999353885650635,
+      "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06,
+      "sampling/sampling_logp_difference/max": 11.8125,
+      "sampling/sampling_logp_difference/mean": 0.020880095660686493,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 7.093600515872822e-06,
+      "clip_ratio/high_mean": 1.7734001289682055e-06,
+      "clip_ratio/low_mean": 4.470584758564655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.647924811251869e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16295.0,
+      "completions/mean_length": 6140.5078125,
+      "completions/mean_terminated_length": 5724.10546875,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 1.0998501181602478,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003946912474930286,
+      "learning_rate": 1e-05,
+      "loss": 0.0448,
+      "num_tokens": 49779920.0,
+      "reward": 0.34375,
+      "reward_std": 0.36796674132347107,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 2.849436668839189e-07,
+      "sampling/sampling_logp_difference/max": 15.070974349975586,
+      "sampling/sampling_logp_difference/mean": 0.021355850622057915,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.313956779038563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.313956779038563e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 6689.8046875,
+      "completions/mean_terminated_length": 6213.04052734375,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.8561654165387154,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021656695753335953,
+      "learning_rate": 1e-05,
+      "loss": 0.0283,
+      "num_tokens": 50655023.0,
+      "reward": 0.203125,
+      "reward_std": 0.21723884344100952,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999941885471344,
+      "sampling/importance_sampling_ratio/min": 2.836359499269747e-06,
+      "sampling/sampling_logp_difference/max": 12.772989273071289,
+      "sampling/sampling_logp_difference/mean": 0.01873670145869255,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 2.3421607693308033e-05,
+      "clip_ratio/high_mean": 7.242933975248889e-06,
+      "clip_ratio/low_mean": 3.896083626386826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.620377103492501e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14330.0,
+      "completions/max_terminated_length": 14330.0,
+      "completions/mean_length": 5707.0078125,
+      "completions/mean_terminated_length": 5707.0078125,
+      "completions/min_length": 625.0,
+      "completions/min_terminated_length": 625.0,
+      "entropy": 1.1396166533231735,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004121148493140936,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 51406536.0,
+      "reward": 0.3125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999328851699829,
+      "sampling/importance_sampling_ratio/min": 0.0005196487763896585,
+      "sampling/sampling_logp_difference/max": 7.562357425689697,
+      "sampling/sampling_logp_difference/mean": 0.020000409334897995,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 1.82290532393381e-05,
+      "clip_ratio/high_mean": 4.557263309834525e-06,
+      "clip_ratio/low_mean": 2.5275351731579576e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9832615496161452e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5655.6328125,
+      "completions/mean_terminated_length": 5571.1572265625,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "entropy": 0.8928132206201553,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032538517843931913,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 52148473.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29432642459869385,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000033378601074,
+      "sampling/importance_sampling_ratio/min": 0.0017573959194123745,
+      "sampling/sampling_logp_difference/max": 6.343922138214111,
+      "sampling/sampling_logp_difference/mean": 0.018881790339946747,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.2836022506235167e-05,
+      "clip_ratio/high_mean": 3.209005626558792e-06,
+      "clip_ratio/low_mean": 3.8109637216621195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.131864307055366e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7399.7890625,
+      "completions/mean_terminated_length": 7034.5771484375,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "entropy": 0.8808257132768631,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002061733277514577,
+      "learning_rate": 1e-05,
+      "loss": 0.0191,
+      "num_tokens": 53113230.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673962593079,
+      "sampling/importance_sampling_ratio/min": 0.005283349193632603,
+      "sampling/sampling_logp_difference/max": 5.243195056915283,
+      "sampling/sampling_logp_difference/mean": 0.018456293269991875,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 1.5806871488166507e-05,
+      "clip_ratio/high_mean": 4.739466817227367e-06,
+      "clip_ratio/low_mean": 3.610486896832299e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.084433521711617e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16208.0,
+      "completions/mean_length": 5730.9609375,
+      "completions/mean_terminated_length": 5475.2880859375,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9486126750707626,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012298432411625981,
+      "learning_rate": 1e-05,
+      "loss": 0.0208,
+      "num_tokens": 53864049.0,
+      "reward": 0.359375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999348521232605,
+      "sampling/importance_sampling_ratio/min": 4.832820559386164e-05,
+      "sampling/sampling_logp_difference/max": 9.937495231628418,
+      "sampling/sampling_logp_difference/mean": 0.01919996738433838,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.2390134997986024e-05,
+      "clip_ratio/high_mean": 3.097533749496506e-06,
+      "clip_ratio/low_mean": 3.8867822581778455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.19653564449618e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13500.0,
+      "completions/mean_length": 4620.5703125,
+      "completions/mean_terminated_length": 4527.94482421875,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9557560831308365,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002882040338590741,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 54473498.0,
+      "reward": 0.3984375,
+      "reward_std": 0.39294686913490295,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998915195465088,
+      "sampling/importance_sampling_ratio/min": 1.577107298089686e-07,
+      "sampling/sampling_logp_difference/max": 15.662503242492676,
+      "sampling/sampling_logp_difference/mean": 0.018525000661611557,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.088819471486204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.088819471486204e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16314.0,
+      "completions/max_terminated_length": 16314.0,
+      "completions/mean_length": 5074.0703125,
+      "completions/mean_terminated_length": 5074.0703125,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8830869868397713,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003324020653963089,
+      "learning_rate": 1e-05,
+      "loss": 0.0305,
+      "num_tokens": 55141787.0,
+      "reward": 0.4609375,
+      "reward_std": 0.30115634202957153,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999203681945801,
+      "sampling/importance_sampling_ratio/min": 0.0009876838885247707,
+      "sampling/sampling_logp_difference/max": 6.920147895812988,
+      "sampling/sampling_logp_difference/mean": 0.018072880804538727,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.526649884908693e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.526649884908693e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15251.0,
+      "completions/max_terminated_length": 15251.0,
+      "completions/mean_length": 6192.1015625,
+      "completions/mean_terminated_length": 6192.1015625,
+      "completions/min_length": 553.0,
+      "completions/min_terminated_length": 553.0,
+      "entropy": 1.0888547226786613,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017452294705435634,
+      "learning_rate": 1e-05,
+      "loss": 0.0216,
+      "num_tokens": 55954144.0,
+      "reward": 0.2890625,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473690986633,
+      "sampling/importance_sampling_ratio/min": 5.061922365712235e-07,
+      "sampling/sampling_logp_difference/max": 14.496349334716797,
+      "sampling/sampling_logp_difference/mean": 0.021221645176410675,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.6768677141953958e-05,
+      "clip_ratio/high_mean": 5.080836899651331e-06,
+      "clip_ratio/low_mean": 3.340929970363504e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.84901372854074e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6204.296875,
+      "completions/mean_terminated_length": 6124.1416015625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 1.0423575639724731,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0033357341308146715,
+      "learning_rate": 1e-05,
+      "loss": 0.1073,
+      "num_tokens": 56765470.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37875816226005554,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99998539686203,
+      "sampling/importance_sampling_ratio/min": 4.564182381727733e-05,
+      "sampling/sampling_logp_difference/max": 9.994686126708984,
+      "sampling/sampling_logp_difference/mean": 0.01908688060939312,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 3.149884150843718e-06,
+      "clip_ratio/high_mean": 7.874710377109295e-07,
+      "clip_ratio/low_mean": 2.430614893000893e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.509361991087644e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14409.0,
+      "completions/max_terminated_length": 14409.0,
+      "completions/mean_length": 5070.3125,
+      "completions/mean_terminated_length": 5070.3125,
+      "completions/min_length": 629.0,
+      "completions/min_terminated_length": 629.0,
+      "entropy": 1.0737399458885193,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038695367984473705,
+      "learning_rate": 1e-05,
+      "loss": 0.0015,
+      "num_tokens": 57432958.0,
+      "reward": 0.390625,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223947525024,
+      "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06,
+      "sampling/sampling_logp_difference/max": 13.376652717590332,
+      "sampling/sampling_logp_difference/mean": 0.01970684342086315,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 1.9821940441033803e-05,
+      "clip_ratio/high_mean": 4.955485110258451e-06,
+      "clip_ratio/low_mean": 2.9055729555693688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.401121466595214e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15799.0,
+      "completions/mean_length": 5750.21875,
+      "completions/mean_terminated_length": 5495.00830078125,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "entropy": 0.9708107560873032,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002927646040916443,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 58187426.0,
+      "reward": 0.296875,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999390840530396,
+      "sampling/importance_sampling_ratio/min": 0.015204614959657192,
+      "sampling/sampling_logp_difference/max": 4.186156272888184,
+      "sampling/sampling_logp_difference/mean": 0.019483914598822594,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.3815636723156786e-05,
+      "clip_ratio/high_mean": 5.953909180789196e-06,
+      "clip_ratio/low_mean": 4.989707144886779e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.585097960647545e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15938.0,
+      "completions/mean_length": 6067.484375,
+      "completions/mean_terminated_length": 5986.251953125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9576351121068001,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0026169484481215477,
+      "learning_rate": 1e-05,
+      "loss": -0.0055,
+      "num_tokens": 58983336.0,
+      "reward": 0.390625,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999620914459229,
+      "sampling/importance_sampling_ratio/min": 1.974713995878119e-06,
+      "sampling/sampling_logp_difference/max": 13.135087013244629,
+      "sampling/sampling_logp_difference/mean": 0.019007554277777672,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 2.4238934656750644e-05,
+      "clip_ratio/high_mean": 7.786730066072778e-06,
+      "clip_ratio/low_mean": 4.5700241571466904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3486972547034384e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13640.0,
+      "completions/max_terminated_length": 13640.0,
+      "completions/mean_length": 4612.8984375,
+      "completions/mean_terminated_length": 4612.8984375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "entropy": 0.9636320173740387,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015429699560627341,
+      "learning_rate": 1e-05,
+      "loss": -0.018,
+      "num_tokens": 59590763.0,
+      "reward": 0.421875,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08,
+      "sampling/sampling_logp_difference/max": 17.468652725219727,
+      "sampling/sampling_logp_difference/mean": 0.019313856959342957,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0911465842109465e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0911465842109465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 6101.3125,
+      "completions/mean_terminated_length": 5854.5283203125,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "entropy": 0.8831139355897903,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022505265660583973,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 60391283.0,
+      "reward": 0.3125,
+      "reward_std": 0.29302334785461426,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 0.0003816343960352242,
+      "sampling/sampling_logp_difference/max": 7.871047496795654,
+      "sampling/sampling_logp_difference/mean": 0.018377842381596565,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 1.547606643725885e-05,
+      "clip_ratio/high_mean": 3.869016609314713e-06,
+      "clip_ratio/low_mean": 2.478705800967873e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8656074391619768e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14862.0,
+      "completions/mean_length": 4705.9921875,
+      "completions/mean_terminated_length": 4614.03955078125,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "entropy": 0.9557913094758987,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002069958718493581,
+      "learning_rate": 1e-05,
+      "loss": -0.0015,
+      "num_tokens": 61021490.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2637920379638672,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030232429504,
+      "sampling/importance_sampling_ratio/min": 2.76673017651774e-05,
+      "sampling/sampling_logp_difference/max": 10.495259284973145,
+      "sampling/sampling_logp_difference/mean": 0.018629569560289383,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 2.0910484636260662e-05,
+      "clip_ratio/high_mean": 5.2276211590651656e-06,
+      "clip_ratio/low_mean": 1.952954164607945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4757162805144617e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13745.0,
+      "completions/max_terminated_length": 13745.0,
+      "completions/mean_length": 5116.78125,
+      "completions/mean_terminated_length": 5116.78125,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "entropy": 1.0198405236005783,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034461067989468575,
+      "learning_rate": 1e-05,
+      "loss": -0.0073,
+      "num_tokens": 61695382.0,
+      "reward": 0.265625,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999936819076538,
+      "sampling/importance_sampling_ratio/min": 0.012227212078869343,
+      "sampling/sampling_logp_difference/max": 4.4040913581848145,
+      "sampling/sampling_logp_difference/mean": 0.019400250166654587,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.5340228401328204e-05,
+      "clip_ratio/high_mean": 3.835057100332051e-06,
+      "clip_ratio/low_mean": 3.150914017169271e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.534419727202476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 5891.9140625,
+      "completions/mean_terminated_length": 5553.45947265625,
+      "completions/min_length": 79.0,
+      "completions/min_terminated_length": 79.0,
+      "entropy": 0.9568078517913818,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025854657869786024,
+      "learning_rate": 1e-05,
+      "loss": 0.1013,
+      "num_tokens": 62474883.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001013278961182,
+      "sampling/importance_sampling_ratio/min": 0.0015072470996528864,
+      "sampling/sampling_logp_difference/max": 6.497470378875732,
+      "sampling/sampling_logp_difference/mean": 0.019574139267206192,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 1.108303422370227e-05,
+      "clip_ratio/high_mean": 2.7707585559255676e-06,
+      "clip_ratio/low_mean": 2.2325777763398946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5096536319324514e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13671.0,
+      "completions/mean_length": 5300.3359375,
+      "completions/mean_terminated_length": 5213.06298828125,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "entropy": 0.9722280204296112,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025075653102248907,
+      "learning_rate": 1e-05,
+      "loss": 0.0312,
+      "num_tokens": 63172454.0,
+      "reward": 0.203125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 0.00020346972451079637,
+      "sampling/sampling_logp_difference/max": 8.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02002432942390442,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 1.3991947980684927e-05,
+      "clip_ratio/high_mean": 3.4979869951712317e-06,
+      "clip_ratio/low_mean": 4.893367201930232e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.243165958290774e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15617.0,
+      "completions/mean_length": 6364.21875,
+      "completions/mean_terminated_length": 6205.1748046875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "entropy": 1.0607495978474617,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017982006538659334,
+      "learning_rate": 1e-05,
+      "loss": -0.0117,
+      "num_tokens": 64007602.0,
+      "reward": 0.2890625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 3.823801307589747e-05,
+      "sampling/sampling_logp_difference/max": 10.171680450439453,
+      "sampling/sampling_logp_difference/mean": 0.020373597741127014,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6416430046083406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6416430046083406e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14709.0,
+      "completions/mean_length": 5746.3125,
+      "completions/mean_terminated_length": 5403.1611328125,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "entropy": 0.9913106113672256,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002207317156717181,
+      "learning_rate": 1e-05,
+      "loss": 0.063,
+      "num_tokens": 64762058.0,
+      "reward": 0.34375,
+      "reward_std": 0.3264310359954834,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999239444732666,
+      "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08,
+      "sampling/sampling_logp_difference/max": 16.744617462158203,
+      "sampling/sampling_logp_difference/mean": 0.020608089864253998,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 1.2681661701208213e-05,
+      "clip_ratio/high_mean": 3.1704154253020533e-06,
+      "clip_ratio/low_mean": 3.541917828897567e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.85895939416514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6088.5625,
+      "completions/mean_terminated_length": 5841.47216796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.9040444120764732,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012974507408216596,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 65561002.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998487234115601,
+      "sampling/importance_sampling_ratio/min": 6.021501121722395e-06,
+      "sampling/sampling_logp_difference/max": 12.020174026489258,
+      "sampling/sampling_logp_difference/mean": 0.01939838007092476,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 7.807132533343975e-06,
+      "clip_ratio/high_mean": 1.9517831333359936e-06,
+      "clip_ratio/low_mean": 1.8564539345788944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.05163223654381e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15021.0,
+      "completions/mean_length": 5765.5,
+      "completions/mean_terminated_length": 5510.65625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 0.9966336265206337,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0013380619930103421,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 66318482.0,
+      "reward": 0.375,
+      "reward_std": 0.13994136452674866,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999471306800842,
+      "sampling/importance_sampling_ratio/min": 7.288413598871557e-06,
+      "sampling/sampling_logp_difference/max": 11.829224586486816,
+      "sampling/sampling_logp_difference/mean": 0.018109245225787163,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 1.7906912489706883e-05,
+      "clip_ratio/high_mean": 4.476728122426721e-06,
+      "clip_ratio/low_mean": 2.5812531305291486e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0289259655091882e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16120.0,
+      "completions/mean_length": 5462.78125,
+      "completions/mean_terminated_length": 5200.67236328125,
+      "completions/min_length": 460.0,
+      "completions/min_terminated_length": 460.0,
+      "entropy": 0.9345141425728798,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023930128663778305,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 67038582.0,
+      "reward": 0.46875,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513030052185,
+      "sampling/importance_sampling_ratio/min": 0.008508839644491673,
+      "sampling/sampling_logp_difference/max": 4.7666497230529785,
+      "sampling/sampling_logp_difference/mean": 0.019220296293497086,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.551389118503721e-05,
+      "clip_ratio/high_mean": 3.878472796259302e-06,
+      "clip_ratio/low_mean": 3.239646628117043e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6274939645863924e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15034.0,
+      "completions/max_terminated_length": 15034.0,
+      "completions/mean_length": 5547.5078125,
+      "completions/mean_terminated_length": 5547.5078125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0511749312281609,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0013633714988827705,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 67774487.0,
+      "reward": 0.203125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05,
+      "sampling/sampling_logp_difference/max": 11.418023109436035,
+      "sampling/sampling_logp_difference/mean": 0.020328814163804054,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.5384989410449634e-05,
+      "clip_ratio/high_mean": 3.846247352612409e-06,
+      "clip_ratio/low_mean": 3.441604167164769e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.826228908110352e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5835.4140625,
+      "completions/mean_terminated_length": 5406.609375,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 1.0024723336100578,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0036165034398436546,
+      "learning_rate": 1e-05,
+      "loss": 0.0373,
+      "num_tokens": 68541660.0,
+      "reward": 0.34375,
+      "reward_std": 0.3584783673286438,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 9.518130354990717e-06,
+      "sampling/sampling_logp_difference/max": 11.562312126159668,
+      "sampling/sampling_logp_difference/mean": 0.020469525828957558,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 6.105602551542688e-06,
+      "clip_ratio/high_mean": 1.526400637885672e-06,
+      "clip_ratio/low_mean": 5.3129634352444555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.46560352177039e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15695.0,
+      "completions/mean_length": 6252.609375,
+      "completions/mean_terminated_length": 6172.83447265625,
+      "completions/min_length": 481.0,
+      "completions/min_terminated_length": 481.0,
+      "entropy": 1.0325519517064095,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022011541295796633,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 69365418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.32301604747772217,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998809099197388,
+      "sampling/importance_sampling_ratio/min": 0.0005531083443202078,
+      "sampling/sampling_logp_difference/max": 7.4999566078186035,
+      "sampling/sampling_logp_difference/mean": 0.02079072594642639,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 4.348128641140647e-06,
+      "clip_ratio/high_mean": 1.0870321602851618e-06,
+      "clip_ratio/low_mean": 3.0097819148977578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.118485085451539e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15316.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 5581.484375,
+      "completions/mean_terminated_length": 5581.484375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.9222500994801521,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002300912281498313,
+      "learning_rate": 1e-05,
+      "loss": -0.0007,
+      "num_tokens": 70099320.0,
+      "reward": 0.296875,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998577833175659,
+      "sampling/importance_sampling_ratio/min": 8.140386853483506e-08,
+      "sampling/sampling_logp_difference/max": 16.323843002319336,
+      "sampling/sampling_logp_difference/mean": 0.01952272653579712,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.5122252029395895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5122252029395895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15781.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5424.140625,
+      "completions/mean_terminated_length": 5424.140625,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "entropy": 1.0446564108133316,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016312639927491546,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 70811474.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000094175338745,
+      "sampling/importance_sampling_ratio/min": 0.0021919538266956806,
+      "sampling/sampling_logp_difference/max": 6.12296199798584,
+      "sampling/sampling_logp_difference/mean": 0.019741754978895187,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.0354576261306647e-05,
+      "clip_ratio/high_mean": 3.496124691082514e-06,
+      "clip_ratio/low_mean": 4.096481598026003e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.446094089871622e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 5884.9609375,
+      "completions/mean_terminated_length": 5884.9609375,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9605691060423851,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032865386456251144,
+      "learning_rate": 1e-05,
+      "loss": 0.0451,
+      "num_tokens": 71582701.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3514111638069153,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999833106994629,
+      "sampling/importance_sampling_ratio/min": 1.149311810877407e-05,
+      "sampling/sampling_logp_difference/max": 11.373762130737305,
+      "sampling/sampling_logp_difference/mean": 0.019438734278082848,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 1.026998006636859e-05,
+      "clip_ratio/high_mean": 2.5674950165921473e-06,
+      "clip_ratio/low_mean": 3.5440503552308655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8007998455213965e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15361.0,
+      "completions/max_terminated_length": 15361.0,
+      "completions/mean_length": 4835.09375,
+      "completions/mean_terminated_length": 4835.09375,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.9038172215223312,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004721678793430328,
+      "learning_rate": 1e-05,
+      "loss": 0.1143,
+      "num_tokens": 72220025.0,
+      "reward": 0.4765625,
+      "reward_std": 0.38481879234313965,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99994957447052,
+      "sampling/importance_sampling_ratio/min": 2.710051205667696e-07,
+      "sampling/sampling_logp_difference/max": 15.12112808227539,
+      "sampling/sampling_logp_difference/mean": 0.017888439819216728,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 2.93432283342554e-05,
+      "clip_ratio/high_mean": 9.56252398509605e-06,
+      "clip_ratio/low_mean": 4.7865792453194445e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.742831808674964e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14431.0,
+      "completions/mean_length": 5979.078125,
+      "completions/mean_terminated_length": 5897.1494140625,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "entropy": 1.0227951630949974,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0010532280430197716,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 73005515.0,
+      "reward": 0.2890625,
+      "reward_std": 0.30115631222724915,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999090433120728,
+      "sampling/importance_sampling_ratio/min": 0.00030157779110595584,
+      "sampling/sampling_logp_difference/max": 8.10648250579834,
+      "sampling/sampling_logp_difference/mean": 0.019633149728178978,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 4.203234766464448e-06,
+      "clip_ratio/high_mean": 1.050808691616112e-06,
+      "clip_ratio/low_mean": 2.5574990331733716e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6625799137036665e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15886.0,
+      "completions/max_terminated_length": 15886.0,
+      "completions/mean_length": 4292.1796875,
+      "completions/mean_terminated_length": 4292.1796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.8719984591007233,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038324075285345316,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 73572794.0,
+      "reward": 0.4375,
+      "reward_std": 0.2972046136856079,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999188780784607,
+      "sampling/importance_sampling_ratio/min": 0.015675775706768036,
+      "sampling/sampling_logp_difference/max": 4.155638694763184,
+      "sampling/sampling_logp_difference/mean": 0.018074234947562218,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 4.431366960488958e-06,
+      "clip_ratio/high_mean": 1.1078417401222396e-06,
+      "clip_ratio/low_mean": 4.433405501913512e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.54418968729442e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14674.0,
+      "completions/max_terminated_length": 14674.0,
+      "completions/mean_length": 5449.2890625,
+      "completions/mean_terminated_length": 5449.2890625,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9137986451387405,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004843447357416153,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 74289607.0,
+      "reward": 0.5,
+      "reward_std": 0.40609243512153625,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 8.851584993863071e-07,
+      "sampling/sampling_logp_difference/max": 13.937499046325684,
+      "sampling/sampling_logp_difference/mean": 0.018183842301368713,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 8.212076863856055e-06,
+      "clip_ratio/high_mean": 2.0530192159640137e-06,
+      "clip_ratio/low_mean": 3.6279372466196946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.833239122741361e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16163.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 4983.3515625,
+      "completions/mean_terminated_length": 4983.3515625,
+      "completions/min_length": 541.0,
+      "completions/min_terminated_length": 541.0,
+      "entropy": 0.9354705810546875,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037651765160262585,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 74946484.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519309043884,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549984931946,
+      "sampling/importance_sampling_ratio/min": 0.00011593531962716952,
+      "sampling/sampling_logp_difference/max": 9.062478065490723,
+      "sampling/sampling_logp_difference/mean": 0.018207306042313576,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.3182888324081432e-05,
+      "clip_ratio/high_mean": 3.295722081020358e-06,
+      "clip_ratio/low_mean": 2.544108633628639e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8736808644680423e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16039.0,
+      "completions/mean_length": 6351.1015625,
+      "completions/mean_terminated_length": 6027.45947265625,
+      "completions/min_length": 804.0,
+      "completions/min_terminated_length": 804.0,
+      "entropy": 0.9310042560100555,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0009160125628113747,
+      "learning_rate": 1e-05,
+      "loss": -0.023,
+      "num_tokens": 75779145.0,
+      "reward": 0.3828125,
+      "reward_std": 0.24329257011413574,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998877048492432,
+      "sampling/importance_sampling_ratio/min": 0.0002961359277833253,
+      "sampling/sampling_logp_difference/max": 8.1246919631958,
+      "sampling/sampling_logp_difference/mean": 0.018513178452849388,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.1402620202716207e-05,
+      "clip_ratio/high_mean": 3.935649147024378e-06,
+      "clip_ratio/low_mean": 3.059757568735222e-05,
+      "clip_ratio/low_min": 4.3258582991256844e-06,
+      "clip_ratio/region_mean": 3.45332257438713e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14471.0,
+      "completions/mean_length": 5293.40625,
+      "completions/mean_terminated_length": 4935.64501953125,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "entropy": 1.0732879787683487,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023993055801838636,
+      "learning_rate": 1e-05,
+      "loss": 0.1021,
+      "num_tokens": 76475557.0,
+      "reward": 0.34375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000077724456787,
+      "sampling/importance_sampling_ratio/min": 6.613240111619234e-05,
+      "sampling/sampling_logp_difference/max": 9.623851776123047,
+      "sampling/sampling_logp_difference/mean": 0.020792219787836075,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 2.130644793396641e-05,
+      "clip_ratio/high_mean": 8.929533635182452e-06,
+      "clip_ratio/low_mean": 2.663600798769039e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.556554071337814e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 7619.7578125,
+      "completions/mean_terminated_length": 7409.41650390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9646238535642624,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014872358879074454,
+      "learning_rate": 1e-05,
+      "loss": 0.0439,
+      "num_tokens": 77474310.0,
+      "reward": 0.34375,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999638795852661,
+      "sampling/importance_sampling_ratio/min": 0.0016686831368133426,
+      "sampling/sampling_logp_difference/max": 6.395720481872559,
+      "sampling/sampling_logp_difference/mean": 0.020074717700481415,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 1.7765815300663235e-05,
+      "clip_ratio/high_mean": 5.154013138053415e-06,
+      "clip_ratio/low_mean": 5.166909659237717e-05,
+      "clip_ratio/low_min": 8.365680514543783e-06,
+      "clip_ratio/region_mean": 5.68231100714911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15984.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 5959.921875,
+      "completions/mean_terminated_length": 5959.921875,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.004471093416214,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00398358516395092,
+      "learning_rate": 1e-05,
+      "loss": 0.1016,
+      "num_tokens": 78257132.0,
+      "reward": 0.359375,
+      "reward_std": 0.3653082847595215,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000170469284058,
+      "sampling/importance_sampling_ratio/min": 0.0030075267422944307,
+      "sampling/sampling_logp_difference/max": 5.806637287139893,
+      "sampling/sampling_logp_difference/mean": 0.020755283534526825,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6946955838648137e-05,
+      "clip_ratio/high_mean": 4.236738959662034e-06,
+      "clip_ratio/low_mean": 4.510891039899434e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.934564867653535e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13736.0,
+      "completions/mean_length": 5427.03125,
+      "completions/mean_terminated_length": 5340.755859375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9117375314235687,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0019883522763848305,
+      "learning_rate": 1e-05,
+      "loss": 0.01,
+      "num_tokens": 78971072.0,
+      "reward": 0.375,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000550746917725,
+      "sampling/importance_sampling_ratio/min": 0.0008046010043472052,
+      "sampling/sampling_logp_difference/max": 7.125164031982422,
+      "sampling/sampling_logp_difference/mean": 0.018812140449881554,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 2.968176841022796e-05,
+      "clip_ratio/high_mean": 7.42044210255699e-06,
+      "clip_ratio/low_mean": 3.220799408154562e-05,
+      "clip_ratio/low_min": 5.315981979947537e-06,
+      "clip_ratio/region_mean": 3.962843629778945e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16293.0,
+      "completions/max_terminated_length": 16293.0,
+      "completions/mean_length": 6062.078125,
+      "completions/mean_terminated_length": 6062.078125,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 1.0164100378751755,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00450351694598794,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 79764434.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26355957984924316,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999713897705078,
+      "sampling/importance_sampling_ratio/min": 0.0007411236292682588,
+      "sampling/sampling_logp_difference/max": 7.207343101501465,
+      "sampling/sampling_logp_difference/mean": 0.020526543259620667,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.856050622947805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.856050622947805e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13689.0,
+      "completions/max_terminated_length": 13689.0,
+      "completions/mean_length": 4856.53125,
+      "completions/mean_terminated_length": 4856.53125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "entropy": 1.0780886858701706,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0033157530706375837,
+      "learning_rate": 1e-05,
+      "loss": 0.046,
+      "num_tokens": 80405238.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3487703502178192,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999889135360718,
+      "sampling/importance_sampling_ratio/min": 0.033773623406887054,
+      "sampling/sampling_logp_difference/max": 3.7256407737731934,
+      "sampling/sampling_logp_difference/mean": 0.019188418984413147,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.975351790406421e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.975351790406421e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16335.0,
+      "completions/max_terminated_length": 16335.0,
+      "completions/mean_length": 3930.5859375,
+      "completions/mean_terminated_length": 3930.5859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8666863515973091,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005471619311720133,
+      "learning_rate": 1e-05,
+      "loss": -0.0779,
+      "num_tokens": 80926721.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3164186179637909,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000040531158447,
+      "sampling/importance_sampling_ratio/min": 0.0002562212466727942,
+      "sampling/sampling_logp_difference/max": 8.269469261169434,
+      "sampling/sampling_logp_difference/mean": 0.017708823084831238,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 6.743997801095247e-06,
+      "clip_ratio/high_mean": 1.6859994502738118e-06,
+      "clip_ratio/low_mean": 3.61007656692891e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7786765119562915e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15546.0,
+      "completions/mean_length": 5934.9453125,
+      "completions/mean_terminated_length": 5684.16845703125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 0.9991667941212654,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002580739092081785,
+      "learning_rate": 1e-05,
+      "loss": -0.0065,
+      "num_tokens": 81707978.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24671243131160736,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000852346420288,
+      "sampling/importance_sampling_ratio/min": 0.002478762762621045,
+      "sampling/sampling_logp_difference/max": 5.999995708465576,
+      "sampling/sampling_logp_difference/mean": 0.019801246002316475,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.43532002741631e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.43532002741631e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 5866.84375,
+      "completions/mean_terminated_length": 5699.9052734375,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "entropy": 0.9848997294902802,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0010949905263260007,
+      "learning_rate": 1e-05,
+      "loss": 0.0266,
+      "num_tokens": 82477310.0,
+      "reward": 0.2734375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999667406082153,
+      "sampling/importance_sampling_ratio/min": 9.04304688447155e-05,
+      "sampling/sampling_logp_difference/max": 9.310929298400879,
+      "sampling/sampling_logp_difference/mean": 0.020769795402884483,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 1.9307613456476247e-05,
+      "clip_ratio/high_mean": 4.826903364119062e-06,
+      "clip_ratio/low_mean": 5.842190330440644e-05,
+      "clip_ratio/low_min": 1.2287753634154797e-05,
+      "clip_ratio/region_mean": 6.324880496322294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14501.0,
+      "completions/max_terminated_length": 14501.0,
+      "completions/mean_length": 6613.7578125,
+      "completions/mean_terminated_length": 6613.7578125,
+      "completions/min_length": 1033.0,
+      "completions/min_terminated_length": 1033.0,
+      "entropy": 0.9176012054085732,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020384234376251698,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 83345055.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999457001686096,
+      "sampling/importance_sampling_ratio/min": 0.029541675001382828,
+      "sampling/sampling_logp_difference/max": 3.5219533443450928,
+      "sampling/sampling_logp_difference/mean": 0.018883168697357178,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.382043183184578e-05,
+      "clip_ratio/high_mean": 3.455107957961445e-06,
+      "clip_ratio/low_mean": 5.789885449303256e-05,
+      "clip_ratio/low_min": 1.017130716718384e-05,
+      "clip_ratio/region_mean": 6.135396188255982e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 6392.3125,
+      "completions/mean_terminated_length": 6070.0,
+      "completions/min_length": 507.0,
+      "completions/min_terminated_length": 507.0,
+      "entropy": 0.904954232275486,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0031166900880634785,
+      "learning_rate": 1e-05,
+      "loss": 0.0351,
+      "num_tokens": 84186343.0,
+      "reward": 0.390625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999208450317383,
+      "sampling/importance_sampling_ratio/min": 0.00022529886336997151,
+      "sampling/sampling_logp_difference/max": 8.398082733154297,
+      "sampling/sampling_logp_difference/mean": 0.01931958645582199,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.7221671441802755e-05,
+      "clip_ratio/high_mean": 6.549099907715572e-06,
+      "clip_ratio/low_mean": 3.147818074467068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802728065238625e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5982.703125,
+      "completions/mean_terminated_length": 5817.603515625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 0.8394555225968361,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022041688207536936,
+      "learning_rate": 1e-05,
+      "loss": 0.1043,
+      "num_tokens": 84971129.0,
+      "reward": 0.3125,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030828475952,
+      "sampling/importance_sampling_ratio/min": 1.553593506287143e-06,
+      "sampling/sampling_logp_difference/max": 13.374939918518066,
+      "sampling/sampling_logp_difference/mean": 0.01795877143740654,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 2.9651660042873118e-05,
+      "clip_ratio/high_mean": 9.398806923854863e-06,
+      "clip_ratio/low_mean": 4.788733849636628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.728614519284747e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14988.0,
+      "completions/mean_length": 4976.921875,
+      "completions/mean_terminated_length": 4608.95166015625,
+      "completions/min_length": 335.0,
+      "completions/min_terminated_length": 335.0,
+      "entropy": 0.8381234556436539,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0037972736172378063,
+      "learning_rate": 1e-05,
+      "loss": 0.1244,
+      "num_tokens": 85625559.0,
+      "reward": 0.4765625,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970555305481,
+      "sampling/importance_sampling_ratio/min": 0.002990707289427519,
+      "sampling/sampling_logp_difference/max": 5.8122453689575195,
+      "sampling/sampling_logp_difference/mean": 0.01815030723810196,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 4.130592969886493e-06,
+      "clip_ratio/high_mean": 1.0326482424716232e-06,
+      "clip_ratio/low_mean": 1.6904315600640984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7936963843112608e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 6307.2421875,
+      "completions/mean_terminated_length": 6065.400390625,
+      "completions/min_length": 823.0,
+      "completions/min_terminated_length": 823.0,
+      "entropy": 1.1176434755325317,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0012413962977007031,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 86453606.0,
+      "reward": 0.28125,
+      "reward_std": 0.2280253767967224,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 0.004730688873678446,
+      "sampling/sampling_logp_difference/max": 5.353684425354004,
+      "sampling/sampling_logp_difference/mean": 0.021790307015180588,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.3160772823539446e-05,
+      "clip_ratio/high_mean": 3.2901932058848615e-06,
+      "clip_ratio/low_mean": 3.582628983167524e-05,
+      "clip_ratio/low_min": 2.61966624748311e-06,
+      "clip_ratio/region_mean": 3.911648195753514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 7263.1640625,
+      "completions/mean_terminated_length": 7044.26416015625,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.107876107096672,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017762042116373777,
+      "learning_rate": 1e-05,
+      "loss": 0.0349,
+      "num_tokens": 87402763.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741315841675,
+      "sampling/importance_sampling_ratio/min": 0.0009408573969267309,
+      "sampling/sampling_logp_difference/max": 6.968719005584717,
+      "sampling/sampling_logp_difference/mean": 0.02103034406900406,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.987745776612428e-05,
+      "clip_ratio/high_mean": 1.1877163728968299e-05,
+      "clip_ratio/low_mean": 4.26799579145154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.455712096136267e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15416.0,
+      "completions/mean_length": 5093.859375,
+      "completions/mean_terminated_length": 4914.65087890625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 1.1065888702869415,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032127038575708866,
+      "learning_rate": 1e-05,
+      "loss": 0.0194,
+      "num_tokens": 88077385.0,
+      "reward": 0.421875,
+      "reward_std": 0.345874547958374,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 7.033879228401929e-05,
+      "sampling/sampling_logp_difference/max": 9.562187194824219,
+      "sampling/sampling_logp_difference/mean": 0.020314980298280716,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 9.35208754526684e-06,
+      "clip_ratio/high_mean": 4.4788730519940145e-06,
+      "clip_ratio/low_mean": 3.470697703278347e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.918584917528278e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6943.53125,
+      "completions/mean_terminated_length": 6639.0,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "entropy": 0.9009081721305847,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028925195802003145,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "num_tokens": 88985269.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3535328209400177,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980628490448,
+      "sampling/importance_sampling_ratio/min": 6.553035092338177e-08,
+      "sampling/sampling_logp_difference/max": 16.540752410888672,
+      "sampling/sampling_logp_difference/mean": 0.019378282129764557,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 1.0939961612166371e-05,
+      "clip_ratio/high_mean": 2.734990403041593e-06,
+      "clip_ratio/low_mean": 2.4615862798782473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7350853201824066e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15148.0,
+      "completions/max_terminated_length": 15148.0,
+      "completions/mean_length": 4976.25,
+      "completions/mean_terminated_length": 4976.25,
+      "completions/min_length": 702.0,
+      "completions/min_terminated_length": 702.0,
+      "entropy": 0.9463540017604828,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017386430408805609,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 89645205.0,
+      "reward": 0.359375,
+      "reward_std": 0.26462042331695557,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999554753303528,
+      "sampling/importance_sampling_ratio/min": 7.889595508459024e-06,
+      "sampling/sampling_logp_difference/max": 11.74996566772461,
+      "sampling/sampling_logp_difference/mean": 0.018035830929875374,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 5.941629297012696e-06,
+      "clip_ratio/high_mean": 1.485407324253174e-06,
+      "clip_ratio/low_mean": 2.6826061798601586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8311469009167922e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 6439.5390625,
+      "completions/mean_terminated_length": 6281.69091796875,
+      "completions/min_length": 959.0,
+      "completions/min_terminated_length": 959.0,
+      "entropy": 0.899876207113266,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037381781730800867,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 90489394.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2624938488006592,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999206066131592,
+      "sampling/importance_sampling_ratio/min": 0.003606764366850257,
+      "sampling/sampling_logp_difference/max": 5.62494421005249,
+      "sampling/sampling_logp_difference/mean": 0.019368179142475128,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 5.189952389628161e-06,
+      "clip_ratio/high_mean": 1.2974880974070402e-06,
+      "clip_ratio/low_mean": 3.058137212974543e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.187886022715247e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15979.0,
+      "completions/mean_length": 6876.46875,
+      "completions/mean_terminated_length": 6408.884765625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.1018569767475128,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018562980694696307,
+      "learning_rate": 1e-05,
+      "loss": 0.095,
+      "num_tokens": 91390054.0,
+      "reward": 0.21875,
+      "reward_std": 0.29955869913101196,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999849796295166,
+      "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05,
+      "sampling/sampling_logp_difference/max": 10.436432838439941,
+      "sampling/sampling_logp_difference/mean": 0.020825792104005814,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.022083435804234e-05,
+      "clip_ratio/high_mean": 5.055208589510585e-06,
+      "clip_ratio/low_mean": 3.029032552603894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.53455343429232e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14153.0,
+      "completions/mean_length": 6501.5078125,
+      "completions/mean_terminated_length": 6344.64306640625,
+      "completions/min_length": 720.0,
+      "completions/min_terminated_length": 720.0,
+      "entropy": 1.073579266667366,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016695430967956781,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 92241535.0,
+      "reward": 0.2734375,
+      "reward_std": 0.28641316294670105,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998984336853027,
+      "sampling/importance_sampling_ratio/min": 0.0002380236255703494,
+      "sampling/sampling_logp_difference/max": 8.343140602111816,
+      "sampling/sampling_logp_difference/mean": 0.020438479259610176,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 3.3911180707946187e-06,
+      "clip_ratio/high_mean": 8.477795176986547e-07,
+      "clip_ratio/low_mean": 2.2190370486896427e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.30381500614385e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14345.0,
+      "completions/max_terminated_length": 14345.0,
+      "completions/mean_length": 5474.1328125,
+      "completions/mean_terminated_length": 5474.1328125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0692576617002487,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034909825772047043,
+      "learning_rate": 1e-05,
+      "loss": 0.0,
+      "num_tokens": 92962472.0,
+      "reward": 0.3046875,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000006079673767,
+      "sampling/importance_sampling_ratio/min": 0.0017851731972768903,
+      "sampling/sampling_logp_difference/max": 6.328239917755127,
+      "sampling/sampling_logp_difference/mean": 0.019930578768253326,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 2.6292200345778838e-05,
+      "clip_ratio/high_mean": 7.620442374900449e-06,
+      "clip_ratio/low_mean": 4.615546390596137e-05,
+      "clip_ratio/low_min": 1.366510537081922e-05,
+      "clip_ratio/region_mean": 5.3775906508235494e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7512.078125,
+      "completions/mean_terminated_length": 7225.88671875,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9676955863833427,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023449272848665714,
+      "learning_rate": 1e-05,
+      "loss": 0.0454,
+      "num_tokens": 93950506.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999359250068665,
+      "sampling/importance_sampling_ratio/min": 0.0016406332142651081,
+      "sampling/sampling_logp_difference/max": 6.412672996520996,
+      "sampling/sampling_logp_difference/mean": 0.020141655579209328,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 5.097255780128762e-06,
+      "clip_ratio/high_mean": 1.2743139450321905e-06,
+      "clip_ratio/low_mean": 3.3802551342887455e-05,
+      "clip_ratio/low_min": 4.146762421441963e-06,
+      "clip_ratio/region_mean": 3.5076865287919645e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6920.484375,
+      "completions/mean_terminated_length": 6693.3603515625,
+      "completions/min_length": 962.0,
+      "completions/min_terminated_length": 962.0,
+      "entropy": 0.8662540689110756,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037103090435266495,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 94854016.0,
+      "reward": 0.4375,
+      "reward_std": 0.322716623544693,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00047686786274425685,
+      "sampling/sampling_logp_difference/max": 7.648271083831787,
+      "sampling/sampling_logp_difference/mean": 0.01915796287357807,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 8.4922439782531e-06,
+      "clip_ratio/high_mean": 2.123060994563275e-06,
+      "clip_ratio/low_mean": 5.024227584726759e-05,
+      "clip_ratio/low_min": 1.3627016414829995e-05,
+      "clip_ratio/region_mean": 5.236533706920454e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 7939.609375,
+      "completions/mean_terminated_length": 7805.57177734375,
+      "completions/min_length": 1260.0,
+      "completions/min_terminated_length": 1260.0,
+      "entropy": 0.9707008600234985,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024642283096909523,
+      "learning_rate": 1e-05,
+      "loss": 0.0788,
+      "num_tokens": 95889966.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998771548271179,
+      "sampling/importance_sampling_ratio/min": 4.540014560916461e-05,
+      "sampling/sampling_logp_difference/max": 9.999995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020453302189707756,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.766829564710861e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.766829564710861e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14969.0,
+      "completions/mean_length": 5985.8203125,
+      "completions/mean_terminated_length": 5474.43408203125,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 0.9083090648055077,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003317479742690921,
+      "learning_rate": 1e-05,
+      "loss": 0.0537,
+      "num_tokens": 96676847.0,
+      "reward": 0.3671875,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.000286750087980181,
+      "sampling/sampling_logp_difference/max": 8.156899452209473,
+      "sampling/sampling_logp_difference/mean": 0.01996719278395176,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.8439853647578275e-05,
+      "clip_ratio/high_mean": 4.609963411894569e-06,
+      "clip_ratio/low_mean": 5.708034223061986e-05,
+      "clip_ratio/low_min": 2.75287948170444e-06,
+      "clip_ratio/region_mean": 6.169030598357494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15081.0,
+      "completions/mean_length": 6565.359375,
+      "completions/mean_terminated_length": 6488.04736328125,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 1.1013468354940414,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019073591101914644,
+      "learning_rate": 1e-05,
+      "loss": 0.0622,
+      "num_tokens": 97539453.0,
+      "reward": 0.2734375,
+      "reward_std": 0.307217001914978,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999555945396423,
+      "sampling/importance_sampling_ratio/min": 0.0006022047018632293,
+      "sampling/sampling_logp_difference/max": 7.414913177490234,
+      "sampling/sampling_logp_difference/mean": 0.02150837704539299,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 9.068485269381199e-06,
+      "clip_ratio/high_mean": 2.2671213173452998e-06,
+      "clip_ratio/low_mean": 1.9822365402433206e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.208948649240483e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16099.0,
+      "completions/mean_length": 6779.6171875,
+      "completions/mean_terminated_length": 6703.9921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8940552547574043,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0010163087863475084,
+      "learning_rate": 1e-05,
+      "loss": 0.0249,
+      "num_tokens": 98429036.0,
+      "reward": 0.453125,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485015869141,
+      "sampling/importance_sampling_ratio/min": 3.464699460664633e-08,
+      "sampling/sampling_logp_difference/max": 17.178054809570312,
+      "sampling/sampling_logp_difference/mean": 0.018716152757406235,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 5.047242211730918e-06,
+      "clip_ratio/high_mean": 1.2618105529327295e-06,
+      "clip_ratio/low_mean": 2.9014110396019532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0275920835265424e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14549.0,
+      "completions/max_terminated_length": 14549.0,
+      "completions/mean_length": 5766.71875,
+      "completions/mean_terminated_length": 5766.71875,
+      "completions/min_length": 47.0,
+      "completions/min_terminated_length": 47.0,
+      "entropy": 1.0455922111868858,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002155766822397709,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 99184264.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999253749847412,
+      "sampling/importance_sampling_ratio/min": 0.00010798005678225309,
+      "sampling/sampling_logp_difference/max": 9.133563995361328,
+      "sampling/sampling_logp_difference/mean": 0.020948775112628937,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 2.0882574972347356e-05,
+      "clip_ratio/high_mean": 6.505383225885453e-06,
+      "clip_ratio/low_mean": 4.496008500609605e-05,
+      "clip_ratio/low_min": 7.757854064038838e-06,
+      "clip_ratio/region_mean": 5.1465468231981504e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14704.0,
+      "completions/mean_length": 6167.2421875,
+      "completions/mean_terminated_length": 6005.07177734375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "entropy": 0.9100174158811569,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0021464223973453045,
+      "learning_rate": 1e-05,
+      "loss": -0.0279,
+      "num_tokens": 99996831.0,
+      "reward": 0.421875,
+      "reward_std": 0.3916535973548889,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240040779114,
+      "sampling/importance_sampling_ratio/min": 0.02249590866267681,
+      "sampling/sampling_logp_difference/max": 3.794421911239624,
+      "sampling/sampling_logp_difference/mean": 0.01866895705461502,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0998018473837874e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0998018473837874e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15738.0,
+      "completions/mean_length": 6242.9453125,
+      "completions/mean_terminated_length": 6163.09423828125,
+      "completions/min_length": 1187.0,
+      "completions/min_terminated_length": 1187.0,
+      "entropy": 0.8624134212732315,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023277695290744305,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 100814112.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999959409236908,
+      "sampling/importance_sampling_ratio/min": 0.0002393616596236825,
+      "sampling/sampling_logp_difference/max": 8.33753490447998,
+      "sampling/sampling_logp_difference/mean": 0.0191188994795084,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 6.589872555196052e-06,
+      "clip_ratio/high_mean": 1.647468138799013e-06,
+      "clip_ratio/low_mean": 4.329304238126497e-05,
+      "clip_ratio/low_min": 3.5120251595799346e-06,
+      "clip_ratio/region_mean": 4.494051017900347e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14866.0,
+      "completions/mean_length": 5733.6875,
+      "completions/mean_terminated_length": 5478.080078125,
+      "completions/min_length": 789.0,
+      "completions/min_terminated_length": 789.0,
+      "entropy": 0.9628067463636398,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003547821193933487,
+      "learning_rate": 1e-05,
+      "loss": 0.0321,
+      "num_tokens": 101566264.0,
+      "reward": 0.3984375,
+      "reward_std": 0.36584997177124023,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0001282035664189607,
+      "sampling/sampling_logp_difference/max": 8.961891174316406,
+      "sampling/sampling_logp_difference/mean": 0.019646761938929558,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.7107527582993498e-05,
+      "clip_ratio/high_mean": 4.2768818957483745e-06,
+      "clip_ratio/low_mean": 3.014796902789385e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.442485103732906e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15848.0,
+      "completions/max_terminated_length": 15848.0,
+      "completions/mean_length": 5505.9375,
+      "completions/mean_terminated_length": 5505.9375,
+      "completions/min_length": 668.0,
+      "completions/min_terminated_length": 668.0,
+      "entropy": 0.8041045889258385,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024891747161746025,
+      "learning_rate": 1e-05,
+      "loss": 0.1406,
+      "num_tokens": 102291456.0,
+      "reward": 0.5,
+      "reward_std": 0.35482609272003174,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999248385429382,
+      "sampling/importance_sampling_ratio/min": 0.0014627616619691253,
+      "sampling/sampling_logp_difference/max": 6.527429103851318,
+      "sampling/sampling_logp_difference/mean": 0.01716250739991665,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.548903105685895e-05,
+      "clip_ratio/high_mean": 3.872257764214737e-06,
+      "clip_ratio/low_mean": 5.380711581892683e-05,
+      "clip_ratio/low_min": 4.5777483137499075e-06,
+      "clip_ratio/region_mean": 5.767937363998499e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16005.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 5003.0625,
+      "completions/mean_terminated_length": 5003.0625,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 0.9115714654326439,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00220683915540576,
+      "learning_rate": 1e-05,
+      "loss": 0.1361,
+      "num_tokens": 102949824.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 8.323705696966499e-05,
+      "sampling/sampling_logp_difference/max": 9.393817901611328,
+      "sampling/sampling_logp_difference/mean": 0.018076512962579727,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.181136096623959e-05,
+      "clip_ratio/high_mean": 5.4528402415598975e-06,
+      "clip_ratio/low_mean": 3.4416837252138066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986967681157694e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15658.0,
+      "completions/max_terminated_length": 15658.0,
+      "completions/mean_length": 4742.1328125,
+      "completions/mean_terminated_length": 4742.1328125,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 0.9430246204137802,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003964806906878948,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 103580913.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 7.031940185697749e-05,
+      "sampling/sampling_logp_difference/max": 9.56246280670166,
+      "sampling/sampling_logp_difference/mean": 0.019651200622320175,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 4.07684046876966e-06,
+      "clip_ratio/high_mean": 1.019210117192415e-06,
+      "clip_ratio/low_mean": 3.8682398553646635e-05,
+      "clip_ratio/low_min": 8.189203072106466e-06,
+      "clip_ratio/region_mean": 3.970160832977854e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 6574.171875,
+      "completions/mean_terminated_length": 6091.72119140625,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.8429529070854187,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002067410387098789,
+      "learning_rate": 1e-05,
+      "loss": 0.0377,
+      "num_tokens": 104447463.0,
+      "reward": 0.3125,
+      "reward_std": 0.24511480331420898,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997583627700806,
+      "sampling/importance_sampling_ratio/min": 0.00021258489869069308,
+      "sampling/sampling_logp_difference/max": 8.456169128417969,
+      "sampling/sampling_logp_difference/mean": 0.018853647634387016,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 1.9725823221961036e-05,
+      "clip_ratio/high_mean": 4.931455805490259e-06,
+      "clip_ratio/low_mean": 5.9263072444082354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.419452870431996e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15518.0,
+      "completions/max_terminated_length": 15518.0,
+      "completions/mean_length": 4581.5625,
+      "completions/mean_terminated_length": 4581.5625,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "entropy": 0.7094272822141647,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004292502999305725,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 105052287.0,
+      "reward": 0.625,
+      "reward_std": 0.3908300995826721,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.0019342642044648528,
+      "sampling/sampling_logp_difference/max": 6.24802827835083,
+      "sampling/sampling_logp_difference/mean": 0.016310662031173706,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.0132298029930098e-05,
+      "clip_ratio/high_mean": 2.5330745074825245e-06,
+      "clip_ratio/low_mean": 4.6397121650443296e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.893019581686531e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16097.0,
+      "completions/mean_length": 7066.4453125,
+      "completions/mean_terminated_length": 6918.5478515625,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8481669947504997,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015785128343850374,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 105977048.0,
+      "reward": 0.3515625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 0.00104097044095397,
+      "sampling/sampling_logp_difference/max": 6.8676018714904785,
+      "sampling/sampling_logp_difference/mean": 0.018304405733942986,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 1.6989023606583942e-05,
+      "clip_ratio/high_mean": 4.2472559016459854e-06,
+      "clip_ratio/low_mean": 2.3075059743860038e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7322315418132348e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16104.0,
+      "completions/max_terminated_length": 16104.0,
+      "completions/mean_length": 6230.5234375,
+      "completions/mean_terminated_length": 6230.5234375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "entropy": 0.9658062160015106,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002542720176279545,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 106793187.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3050953149795532,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000169277191162,
+      "sampling/importance_sampling_ratio/min": 0.0002781494113150984,
+      "sampling/sampling_logp_difference/max": 8.187352180480957,
+      "sampling/sampling_logp_difference/mean": 0.019391046836972237,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7597974508353218e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7597974508353218e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14216.0,
+      "completions/mean_length": 5690.5546875,
+      "completions/mean_terminated_length": 5606.3544921875,
+      "completions/min_length": 1124.0,
+      "completions/min_terminated_length": 1124.0,
+      "entropy": 1.0098655670881271,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001451602904126048,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 107539874.0,
+      "reward": 0.4296875,
+      "reward_std": 0.23304283618927002,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999307990074158,
+      "sampling/importance_sampling_ratio/min": 5.640022671116185e-09,
+      "sampling/sampling_logp_difference/max": 18.993377685546875,
+      "sampling/sampling_logp_difference/mean": 0.018607191741466522,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 1.2800467629858758e-05,
+      "clip_ratio/high_mean": 4.19954119479371e-06,
+      "clip_ratio/low_mean": 2.350350996493944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.770305115973315e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15791.0,
+      "completions/max_terminated_length": 15791.0,
+      "completions/mean_length": 5471.1328125,
+      "completions/mean_terminated_length": 5471.1328125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0413162112236023,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023549250327050686,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 108260091.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999832510948181,
+      "sampling/importance_sampling_ratio/min": 0.0011709182290360332,
+      "sampling/sampling_logp_difference/max": 6.749967098236084,
+      "sampling/sampling_logp_difference/mean": 0.020427243784070015,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.1983064925734652e-05,
+      "clip_ratio/high_mean": 5.495766231433663e-06,
+      "clip_ratio/low_mean": 4.361141452591255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9107180757346214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 6211.7421875,
+      "completions/mean_terminated_length": 6050.2783203125,
+      "completions/min_length": 622.0,
+      "completions/min_terminated_length": 622.0,
+      "entropy": 0.9706784337759018,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017527056625112891,
+      "learning_rate": 1e-05,
+      "loss": 0.0686,
+      "num_tokens": 109073890.0,
+      "reward": 0.421875,
+      "reward_std": 0.29826050996780396,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999092221260071,
+      "sampling/importance_sampling_ratio/min": 0.002898645820096135,
+      "sampling/sampling_logp_difference/max": 5.843511581420898,
+      "sampling/sampling_logp_difference/mean": 0.018898162990808487,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.208964992358233e-05,
+      "clip_ratio/low_min": 3.9168990042526275e-06,
+      "clip_ratio/region_mean": 4.208964992358233e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14880.0,
+      "completions/mean_length": 6007.8984375,
+      "completions/mean_terminated_length": 5926.19677734375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.1967609524726868,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0007858420140109956,
+      "learning_rate": 1e-05,
+      "loss": 0.011,
+      "num_tokens": 109861813.0,
+      "reward": 0.296875,
+      "reward_std": 0.23486506938934326,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 3.294382011631569e-08,
+      "sampling/sampling_logp_difference/max": 17.22846221923828,
+      "sampling/sampling_logp_difference/mean": 0.021845955401659012,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 4.5118208618077915e-06,
+      "clip_ratio/high_mean": 1.1279552154519479e-06,
+      "clip_ratio/low_mean": 3.749712686840212e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8625082197540905e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15838.0,
+      "completions/mean_length": 6800.9921875,
+      "completions/mean_terminated_length": 6725.53564453125,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "entropy": 1.0437887012958527,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029428249690681696,
+      "learning_rate": 1e-05,
+      "loss": 0.0405,
+      "num_tokens": 110756572.0,
+      "reward": 0.265625,
+      "reward_std": 0.3248382806777954,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999890327453613,
+      "sampling/importance_sampling_ratio/min": 0.0006329434108920395,
+      "sampling/sampling_logp_difference/max": 7.365129470825195,
+      "sampling/sampling_logp_difference/mean": 0.02010120078921318,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.427700522071973e-05,
+      "clip_ratio/high_mean": 3.5692513051799324e-06,
+      "clip_ratio/low_mean": 4.964020990883e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.320946092979284e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6309.4453125,
+      "completions/mean_terminated_length": 6230.1181640625,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "entropy": 0.9768906533718109,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002088683657348156,
+      "learning_rate": 1e-05,
+      "loss": 0.0316,
+      "num_tokens": 111585493.0,
+      "reward": 0.375,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000007152557373,
+      "sampling/importance_sampling_ratio/min": 0.009723234921693802,
+      "sampling/sampling_logp_difference/max": 4.633236885070801,
+      "sampling/sampling_logp_difference/mean": 0.020927833393216133,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 5.4841398196003865e-06,
+      "clip_ratio/high_mean": 1.3710349549000966e-06,
+      "clip_ratio/low_mean": 5.122006064084417e-05,
+      "clip_ratio/low_min": 3.785125954891555e-06,
+      "clip_ratio/region_mean": 5.25910957094311e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15209.0,
+      "completions/mean_length": 6221.859375,
+      "completions/mean_terminated_length": 6060.5556640625,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "entropy": 0.9212924689054489,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002406956860795617,
+      "learning_rate": 1e-05,
+      "loss": 0.1051,
+      "num_tokens": 112400363.0,
+      "reward": 0.40625,
+      "reward_std": 0.31929677724838257,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 5.8308287407271564e-05,
+      "sampling/sampling_logp_difference/max": 9.74976634979248,
+      "sampling/sampling_logp_difference/mean": 0.018652018159627914,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 1.4568151755156578e-05,
+      "clip_ratio/high_mean": 3.6420379387891444e-06,
+      "clip_ratio/low_mean": 3.999794398623635e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3639981413434725e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14997.0,
+      "completions/mean_length": 6942.8203125,
+      "completions/mean_terminated_length": 6716.232421875,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "entropy": 0.949538916349411,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022962254006415606,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 113308748.0,
+      "reward": 0.375,
+      "reward_std": 0.3329663872718811,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999334812164307,
+      "sampling/importance_sampling_ratio/min": 0.00048810525913722813,
+      "sampling/sampling_logp_difference/max": 7.624979496002197,
+      "sampling/sampling_logp_difference/mean": 0.01939917355775833,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 8.786732450971613e-06,
+      "clip_ratio/high_mean": 2.196683112742903e-06,
+      "clip_ratio/low_mean": 5.562954720517155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.7826231113722315e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15182.0,
+      "completions/mean_length": 6783.1796875,
+      "completions/mean_terminated_length": 6552.76025390625,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "entropy": 0.9774708449840546,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020560629200190306,
+      "learning_rate": 1e-05,
+      "loss": 0.0473,
+      "num_tokens": 114196235.0,
+      "reward": 0.34375,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998990297317505,
+      "sampling/importance_sampling_ratio/min": 2.4757892447269114e-07,
+      "sampling/sampling_logp_difference/max": 15.211536407470703,
+      "sampling/sampling_logp_difference/mean": 0.019691556692123413,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.799483243303257e-05,
+      "clip_ratio/high_mean": 4.498708108258143e-06,
+      "clip_ratio/low_mean": 2.6389980291696702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0888688343111426e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15549.0,
+      "completions/mean_length": 5568.15625,
+      "completions/mean_terminated_length": 5396.4765625,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "entropy": 0.9303529411554337,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022214846685528755,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 114928047.0,
+      "reward": 0.234375,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999408721923828,
+      "sampling/importance_sampling_ratio/min": 2.1446083337650634e-05,
+      "sampling/sampling_logp_difference/max": 10.749968528747559,
+      "sampling/sampling_logp_difference/mean": 0.01938418298959732,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 1.1957493370573502e-05,
+      "clip_ratio/high_mean": 2.9893733426433755e-06,
+      "clip_ratio/low_mean": 5.885063319510664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.184000585562899e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15340.0,
+      "completions/max_terminated_length": 15340.0,
+      "completions/mean_length": 6086.578125,
+      "completions/mean_terminated_length": 6086.578125,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 0.9131873697042465,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002448044717311859,
+      "learning_rate": 1e-05,
+      "loss": 0.0599,
+      "num_tokens": 115725657.0,
+      "reward": 0.40625,
+      "reward_std": 0.35878273844718933,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999779462814331,
+      "sampling/importance_sampling_ratio/min": 0.02929726243019104,
+      "sampling/sampling_logp_difference/max": 3.530261278152466,
+      "sampling/sampling_logp_difference/mean": 0.019298439845442772,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 1.3385357760853367e-05,
+      "clip_ratio/high_mean": 3.3463394402133417e-06,
+      "clip_ratio/low_mean": 5.717015119444113e-05,
+      "clip_ratio/low_min": 3.4328400033700746e-06,
+      "clip_ratio/region_mean": 6.0516490520967636e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 6442.5390625,
+      "completions/mean_terminated_length": 6203.9443359375,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.8959419652819633,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002013204852119088,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 116571478.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000044584274292,
+      "sampling/importance_sampling_ratio/min": 1.0374163821325055e-06,
+      "sampling/sampling_logp_difference/max": 13.778777122497559,
+      "sampling/sampling_logp_difference/mean": 0.01925014518201351,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 9.34224021875707e-06,
+      "clip_ratio/high_mean": 3.136903728773177e-06,
+      "clip_ratio/low_mean": 2.9738095065567904e-05,
+      "clip_ratio/low_min": 3.7240065466903616e-06,
+      "clip_ratio/region_mean": 3.2874999135401595e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15946.0,
+      "completions/mean_length": 6633.5703125,
+      "completions/mean_terminated_length": 6319.0400390625,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.0223619118332863,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024523327592760324,
+      "learning_rate": 1e-05,
+      "loss": 0.056,
+      "num_tokens": 117440743.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 3.0026931199245155e-05,
+      "sampling/sampling_logp_difference/max": 10.413415908813477,
+      "sampling/sampling_logp_difference/mean": 0.02061290666460991,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 1.4537483366439119e-05,
+      "clip_ratio/high_mean": 3.6343708416097797e-06,
+      "clip_ratio/low_mean": 3.954866042477079e-05,
+      "clip_ratio/low_min": 9.874949228105834e-06,
+      "clip_ratio/region_mean": 4.318303126638057e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15919.0,
+      "completions/mean_length": 7183.0,
+      "completions/mean_terminated_length": 6886.193359375,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "entropy": 0.9815369099378586,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0018688985146582127,
+      "learning_rate": 1e-05,
+      "loss": 0.0395,
+      "num_tokens": 118380687.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2498900145292282,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999039173126221,
+      "sampling/importance_sampling_ratio/min": 1.3847662557964213e-05,
+      "sampling/sampling_logp_difference/max": 11.187394142150879,
+      "sampling/sampling_logp_difference/mean": 0.019792160019278526,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 7.165636361605721e-06,
+      "clip_ratio/high_mean": 1.7914090904014301e-06,
+      "clip_ratio/low_mean": 4.9011068711024564e-05,
+      "clip_ratio/low_min": 1.0991705721608014e-05,
+      "clip_ratio/region_mean": 5.0802477687739156e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16246.0,
+      "completions/mean_length": 6324.640625,
+      "completions/mean_terminated_length": 5829.91748046875,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "entropy": 0.852975606918335,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002005894435569644,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 119207089.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000035762786865,
+      "sampling/importance_sampling_ratio/min": 5.788659223071591e-07,
+      "sampling/sampling_logp_difference/max": 14.362195014953613,
+      "sampling/sampling_logp_difference/mean": 0.01853565312922001,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 7.795394822096569e-06,
+      "clip_ratio/high_mean": 1.948848705524142e-06,
+      "clip_ratio/low_mean": 3.834237736555224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0291225786859286e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 5723.421875,
+      "completions/mean_terminated_length": 5290.06494140625,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.8744911625981331,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002577397273853421,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 119961895.0,
+      "reward": 0.390625,
+      "reward_std": 0.34321609139442444,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999703764915466,
+      "sampling/importance_sampling_ratio/min": 0.07882421463727951,
+      "sampling/sampling_logp_difference/max": 2.5405349731445312,
+      "sampling/sampling_logp_difference/mean": 0.018341556191444397,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 9.214097190124448e-06,
+      "clip_ratio/high_mean": 2.303524297531112e-06,
+      "clip_ratio/low_mean": 2.636873176697918e-05,
+      "clip_ratio/low_min": 2.9339967113628518e-06,
+      "clip_ratio/region_mean": 2.8672255837136618e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16055.0,
+      "completions/mean_length": 7886.015625,
+      "completions/mean_terminated_length": 7682.064453125,
+      "completions/min_length": 989.0,
+      "completions/min_terminated_length": 989.0,
+      "entropy": 0.9391767829656601,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002552987542003393,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 120990289.0,
+      "reward": 0.328125,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000030994415283,
+      "sampling/importance_sampling_ratio/min": 0.000899312668479979,
+      "sampling/sampling_logp_difference/max": 7.013879776000977,
+      "sampling/sampling_logp_difference/mean": 0.02049873024225235,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 3.406416203688423e-05,
+      "clip_ratio/high_mean": 9.72330332160709e-06,
+      "clip_ratio/low_mean": 3.168332909808669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140663151019908e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 6173.1640625,
+      "completions/mean_terminated_length": 6011.087890625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.9148785546422005,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002678362652659416,
+      "learning_rate": 1e-05,
+      "loss": 0.039,
+      "num_tokens": 121797958.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3608373999595642,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999265074729919,
+      "sampling/importance_sampling_ratio/min": 0.002013920107856393,
+      "sampling/sampling_logp_difference/max": 6.207672119140625,
+      "sampling/sampling_logp_difference/mean": 0.018977735191583633,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 1.8476588593330234e-05,
+      "clip_ratio/high_mean": 4.6191471483325586e-06,
+      "clip_ratio/low_mean": 4.459614581264759e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9215293188353826e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 6594.21875,
+      "completions/mean_terminated_length": 6196.259765625,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "entropy": 0.9486038386821747,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033711253199726343,
+      "learning_rate": 1e-05,
+      "loss": 0.026,
+      "num_tokens": 122661170.0,
+      "reward": 0.3828125,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998981356620789,
+      "sampling/importance_sampling_ratio/min": 0.0002968419576063752,
+      "sampling/sampling_logp_difference/max": 8.122310638427734,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 7.97335997049231e-06,
+      "clip_ratio/high_mean": 2.7343705824023345e-06,
+      "clip_ratio/low_mean": 5.420079878604156e-05,
+      "clip_ratio/low_min": 4.594068286678521e-06,
+      "clip_ratio/region_mean": 5.693517005056492e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15928.0,
+      "completions/mean_length": 6533.9453125,
+      "completions/mean_terminated_length": 6377.595703125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9986584335565567,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017857529455795884,
+      "learning_rate": 1e-05,
+      "loss": 0.0804,
+      "num_tokens": 123518107.0,
+      "reward": 0.34375,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998549818992615,
+      "sampling/importance_sampling_ratio/min": 9.012701411847956e-06,
+      "sampling/sampling_logp_difference/max": 11.616875648498535,
+      "sampling/sampling_logp_difference/mean": 0.02010391652584076,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 4.470512521947967e-06,
+      "clip_ratio/high_mean": 1.1176281304869917e-06,
+      "clip_ratio/low_mean": 3.5141094485879876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.625872295742738e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13212.0,
+      "completions/mean_length": 5742.21875,
+      "completions/mean_terminated_length": 5658.42529296875,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0379670709371567,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018227624241262674,
+      "learning_rate": 1e-05,
+      "loss": -0.0237,
+      "num_tokens": 124279031.0,
+      "reward": 0.21875,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998506903648376,
+      "sampling/importance_sampling_ratio/min": 0.0020977305248379707,
+      "sampling/sampling_logp_difference/max": 6.16689920425415,
+      "sampling/sampling_logp_difference/mean": 0.019987668842077255,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.0003542683989508e-05,
+      "clip_ratio/high_mean": 3.21091931709816e-06,
+      "clip_ratio/low_mean": 5.731009014198207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.0521009800140746e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7584.703125,
+      "completions/mean_terminated_length": 7515.41748046875,
+      "completions/min_length": 884.0,
+      "completions/min_terminated_length": 884.0,
+      "entropy": 0.953459307551384,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002219022251665592,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 125270761.0,
+      "reward": 0.359375,
+      "reward_std": 0.37033066153526306,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999880790710449,
+      "sampling/importance_sampling_ratio/min": 0.0024849213659763336,
+      "sampling/sampling_logp_difference/max": 5.997514247894287,
+      "sampling/sampling_logp_difference/mean": 0.020291510969400406,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 7.734669452474918e-06,
+      "clip_ratio/high_mean": 1.9336673631187296e-06,
+      "clip_ratio/low_mean": 3.1135301298945706e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3068968605221016e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 4714.671875,
+      "completions/mean_terminated_length": 4622.78759765625,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 1.018719919025898,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0014189074281603098,
+      "learning_rate": 1e-05,
+      "loss": 0.0501,
+      "num_tokens": 125895279.0,
+      "reward": 0.3984375,
+      "reward_std": 0.28383445739746094,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479651451111,
+      "sampling/importance_sampling_ratio/min": 4.017410901724361e-07,
+      "sampling/sampling_logp_difference/max": 14.727458000183105,
+      "sampling/sampling_logp_difference/mean": 0.018739396706223488,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 1.0069575182569679e-05,
+      "clip_ratio/high_mean": 2.5173937956424197e-06,
+      "clip_ratio/low_mean": 3.824179225375701e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0759185367278405e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15913.0,
+      "completions/mean_length": 6316.140625,
+      "completions/mean_terminated_length": 6074.51220703125,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "entropy": 0.9325072392821312,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001702460227534175,
+      "learning_rate": 1e-05,
+      "loss": 0.1007,
+      "num_tokens": 126722881.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999539852142334,
+      "sampling/importance_sampling_ratio/min": 0.0012551364488899708,
+      "sampling/sampling_logp_difference/max": 6.680510997772217,
+      "sampling/sampling_logp_difference/mean": 0.01929408684372902,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 6.873041002108948e-06,
+      "clip_ratio/high_mean": 1.718260250527237e-06,
+      "clip_ratio/low_mean": 3.119859468370123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.291685527528898e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15832.0,
+      "completions/mean_length": 4687.140625,
+      "completions/mean_terminated_length": 4595.03955078125,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 1.0886607319116592,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032931750174611807,
+      "learning_rate": 1e-05,
+      "loss": 0.0078,
+      "num_tokens": 127341715.0,
+      "reward": 0.28125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821186065674,
+      "sampling/importance_sampling_ratio/min": 0.0019364450126886368,
+      "sampling/sampling_logp_difference/max": 6.246901512145996,
+      "sampling/sampling_logp_difference/mean": 0.020621225237846375,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 1.773085250533768e-05,
+      "clip_ratio/high_mean": 4.43271312633442e-06,
+      "clip_ratio/low_mean": 4.30743207289197e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7507033741567284e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14125.0,
+      "completions/mean_length": 5705.515625,
+      "completions/mean_terminated_length": 5449.232421875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0523068830370903,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0031696646474301815,
+      "learning_rate": 1e-05,
+      "loss": -0.0414,
+      "num_tokens": 128093597.0,
+      "reward": 0.1953125,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619126319885,
+      "sampling/importance_sampling_ratio/min": 3.197810656274669e-05,
+      "sampling/sampling_logp_difference/max": 10.350459098815918,
+      "sampling/sampling_logp_difference/mean": 0.021961934864521027,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 1.885905066956184e-05,
+      "clip_ratio/high_mean": 4.71476266739046e-06,
+      "clip_ratio/low_mean": 5.0530389898995054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.524515336219338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15958.0,
+      "completions/mean_length": 6214.4921875,
+      "completions/mean_terminated_length": 6053.07177734375,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.9371421113610268,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023704832419753075,
+      "learning_rate": 1e-05,
+      "loss": 0.075,
+      "num_tokens": 128906948.0,
+      "reward": 0.40625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.0003354824730195105,
+      "sampling/sampling_logp_difference/max": 7.999940872192383,
+      "sampling/sampling_logp_difference/mean": 0.01882763020694256,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 3.042072216885572e-05,
+      "clip_ratio/high_mean": 7.60518054221393e-06,
+      "clip_ratio/low_mean": 4.5897569179942366e-05,
+      "clip_ratio/low_min": 8.727477506909054e-06,
+      "clip_ratio/region_mean": 5.3502750233747065e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 7127.0703125,
+      "completions/mean_terminated_length": 7054.18115234375,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.9854387491941452,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003370177699252963,
+      "learning_rate": 1e-05,
+      "loss": 0.1197,
+      "num_tokens": 129839813.0,
+      "reward": 0.359375,
+      "reward_std": 0.3329663574695587,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999907910823822,
+      "sampling/importance_sampling_ratio/min": 1.077816432371037e-05,
+      "sampling/sampling_logp_difference/max": 11.43798828125,
+      "sampling/sampling_logp_difference/mean": 0.019736800342798233,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.1401074718596647e-05,
+      "clip_ratio/high_mean": 6.243764005375851e-06,
+      "clip_ratio/low_mean": 3.2797592325550795e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.904135610355297e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 6566.2890625,
+      "completions/mean_terminated_length": 6330.6640625,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.7978609576821327,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026055986527353525,
+      "learning_rate": 1e-05,
+      "loss": 0.0661,
+      "num_tokens": 130698370.0,
+      "reward": 0.5,
+      "reward_std": 0.36295419931411743,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999133944511414,
+      "sampling/importance_sampling_ratio/min": 0.00031152591691352427,
+      "sampling/sampling_logp_difference/max": 8.074028015136719,
+      "sampling/sampling_logp_difference/mean": 0.01787097379565239,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0564424403346493e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0564424403346493e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15576.0,
+      "completions/max_terminated_length": 15576.0,
+      "completions/mean_length": 7186.2890625,
+      "completions/mean_terminated_length": 7186.2890625,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 1.0232757329940796,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0023866184055805206,
+      "learning_rate": 1e-05,
+      "loss": 0.0683,
+      "num_tokens": 131637439.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2059282809495926,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999207258224487,
+      "sampling/importance_sampling_ratio/min": 0.0007378471200354397,
+      "sampling/sampling_logp_difference/max": 7.211773872375488,
+      "sampling/sampling_logp_difference/mean": 0.02137116715312004,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 4.037900725961663e-05,
+      "clip_ratio/high_mean": 1.0094751814904157e-05,
+      "clip_ratio/low_mean": 5.8380828136250784e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.847557995115494e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13638.0,
+      "completions/mean_length": 5591.5703125,
+      "completions/mean_terminated_length": 5420.26220703125,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9335208311676979,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003491115989163518,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 132371816.0,
+      "reward": 0.5,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999891459941864,
+      "sampling/importance_sampling_ratio/min": 0.00012356207298580557,
+      "sampling/sampling_logp_difference/max": 8.998766899108887,
+      "sampling/sampling_logp_difference/mean": 0.018760837614536285,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 2.8378776733006816e-06,
+      "clip_ratio/high_mean": 7.094694183251704e-07,
+      "clip_ratio/low_mean": 4.4085751369493664e-05,
+      "clip_ratio/low_min": 6.7955093072669115e-06,
+      "clip_ratio/region_mean": 4.4795220674132e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16302.0,
+      "completions/mean_length": 7152.3828125,
+      "completions/mean_terminated_length": 6930.82421875,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.1329835206270218,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002830669516697526,
+      "learning_rate": 1e-05,
+      "loss": 0.0526,
+      "num_tokens": 133307297.0,
+      "reward": 0.28125,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999501705169678,
+      "sampling/importance_sampling_ratio/min": 0.00028047082014381886,
+      "sampling/sampling_logp_difference/max": 8.179040908813477,
+      "sampling/sampling_logp_difference/mean": 0.021548541262745857,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.0150829439226072e-05,
+      "clip_ratio/high_mean": 2.537707359806518e-06,
+      "clip_ratio/low_mean": 3.4009618616437365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.654732597624388e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15068.0,
+      "completions/mean_length": 7263.453125,
+      "completions/mean_terminated_length": 7118.68310546875,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.092760555446148,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0027821618132293224,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 134260107.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999946117401123,
+      "sampling/importance_sampling_ratio/min": 7.832317351130769e-05,
+      "sampling/sampling_logp_difference/max": 9.454667091369629,
+      "sampling/sampling_logp_difference/mean": 0.022098438814282417,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 1.0561876024439698e-05,
+      "clip_ratio/high_mean": 2.6404690061099245e-06,
+      "clip_ratio/low_mean": 1.6864279416495265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9504748649978865e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15388.0,
+      "completions/mean_length": 7088.8125,
+      "completions/mean_terminated_length": 6710.958984375,
+      "completions/min_length": 1314.0,
+      "completions/min_terminated_length": 1314.0,
+      "entropy": 1.0669445469975471,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0007076738984324038,
+      "learning_rate": 1e-05,
+      "loss": -0.0197,
+      "num_tokens": 135186139.0,
+      "reward": 0.328125,
+      "reward_std": 0.20593319833278656,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998199343681335,
+      "sampling/importance_sampling_ratio/min": 3.084653872065246e-05,
+      "sampling/sampling_logp_difference/max": 10.386486053466797,
+      "sampling/sampling_logp_difference/mean": 0.020075790584087372,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 7.095016371749807e-06,
+      "clip_ratio/high_mean": 1.7737540929374518e-06,
+      "clip_ratio/low_mean": 2.7592465016823553e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.936621888238733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15626.0,
+      "completions/max_terminated_length": 15626.0,
+      "completions/mean_length": 5352.734375,
+      "completions/mean_terminated_length": 5352.734375,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "entropy": 1.0387161895632744,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0022445612121373415,
+      "learning_rate": 1e-05,
+      "loss": 0.0261,
+      "num_tokens": 135888929.0,
+      "reward": 0.4765625,
+      "reward_std": 0.399257630109787,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054670333862,
+      "sampling/importance_sampling_ratio/min": 0.00032565294532105327,
+      "sampling/sampling_logp_difference/max": 8.029678344726562,
+      "sampling/sampling_logp_difference/mean": 0.02010166086256504,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 1.5100852124305675e-05,
+      "clip_ratio/high_mean": 4.426987970873597e-06,
+      "clip_ratio/low_mean": 2.7625993425317574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2052981168817496e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16266.0,
+      "completions/mean_length": 7758.90625,
+      "completions/mean_terminated_length": 7408.29248046875,
+      "completions/min_length": 742.0,
+      "completions/min_terminated_length": 742.0,
+      "entropy": 1.0648984238505363,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022021254990249872,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 136901941.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858140945435,
+      "sampling/importance_sampling_ratio/min": 2.2461865967216e-07,
+      "sampling/sampling_logp_difference/max": 15.30886173248291,
+      "sampling/sampling_logp_difference/mean": 0.021426808089017868,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 2.5346608254039893e-05,
+      "clip_ratio/high_mean": 7.4063813144675805e-06,
+      "clip_ratio/low_mean": 2.2069365058996482e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9475746259777225e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 7036.953125,
+      "completions/mean_terminated_length": 6496.21484375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9684997871518135,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0013461806811392307,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 137824623.0,
+      "reward": 0.34375,
+      "reward_std": 0.2546031177043915,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999944806098938,
+      "sampling/importance_sampling_ratio/min": 5.834372132085264e-05,
+      "sampling/sampling_logp_difference/max": 9.74915885925293,
+      "sampling/sampling_logp_difference/mean": 0.020304443314671516,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.3147734080121154e-05,
+      "clip_ratio/high_mean": 3.2869335200302885e-06,
+      "clip_ratio/low_mean": 4.841489999307669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.170183294467279e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15500.0,
+      "completions/mean_length": 6114.1875,
+      "completions/mean_terminated_length": 5951.1748046875,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "entropy": 0.943072073161602,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132438588887453,
+      "learning_rate": 1e-05,
+      "loss": 0.0943,
+      "num_tokens": 138625247.0,
+      "reward": 0.40625,
+      "reward_std": 0.321650892496109,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999298453330994,
+      "sampling/importance_sampling_ratio/min": 0.0017275095451623201,
+      "sampling/sampling_logp_difference/max": 6.361074447631836,
+      "sampling/sampling_logp_difference/mean": 0.020084267482161522,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 1.7873157958092634e-05,
+      "clip_ratio/high_mean": 4.468289489523158e-06,
+      "clip_ratio/low_mean": 3.5252990301160025e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9721279790683184e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15050.0,
+      "completions/mean_length": 7618.875,
+      "completions/mean_terminated_length": 7034.53369140625,
+      "completions/min_length": 1030.0,
+      "completions/min_terminated_length": 1030.0,
+      "entropy": 0.9142575263977051,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026741649489849806,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 139619287.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 0.005949751473963261,
+      "sampling/sampling_logp_difference/max": 5.124405860900879,
+      "sampling/sampling_logp_difference/mean": 0.020061582326889038,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.0512151675357018e-05,
+      "clip_ratio/high_mean": 2.6280379188392544e-06,
+      "clip_ratio/low_mean": 4.5301517502593924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.792955542143318e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16106.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 5333.875,
+      "completions/mean_terminated_length": 5333.875,
+      "completions/min_length": 1109.0,
+      "completions/min_terminated_length": 1109.0,
+      "entropy": 0.8107482865452766,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027016003150492907,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 140318935.0,
+      "reward": 0.5703125,
+      "reward_std": 0.2556639611721039,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.006856904830783606,
+      "sampling/sampling_logp_difference/max": 4.982499122619629,
+      "sampling/sampling_logp_difference/mean": 0.017069874331355095,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.85085939392593e-05,
+      "clip_ratio/high_mean": 5.24943533264377e-06,
+      "clip_ratio/low_mean": 5.6120721524166584e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.137015702734061e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16050.0,
+      "completions/mean_length": 7443.3046875,
+      "completions/mean_terminated_length": 7154.89501953125,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 0.9224414080381393,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002655779244378209,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 141293534.0,
+      "reward": 0.234375,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999659061431885,
+      "sampling/importance_sampling_ratio/min": 0.00042018835665658116,
+      "sampling/sampling_logp_difference/max": 7.774807453155518,
+      "sampling/sampling_logp_difference/mean": 0.02006504125893116,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.494229445597739e-05,
+      "clip_ratio/high_mean": 3.7355736139943474e-06,
+      "clip_ratio/low_mean": 2.2748562741981004e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6484136355975352e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15923.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 5646.6875,
+      "completions/mean_terminated_length": 5646.6875,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8945339694619179,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016281780553981662,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 142037438.0,
+      "reward": 0.46875,
+      "reward_std": 0.17912296950817108,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030517578125,
+      "sampling/importance_sampling_ratio/min": 0.0005717006279155612,
+      "sampling/sampling_logp_difference/max": 7.46689510345459,
+      "sampling/sampling_logp_difference/mean": 0.019336247816681862,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 3.335990868436056e-05,
+      "clip_ratio/high_mean": 8.33997717109014e-06,
+      "clip_ratio/low_mean": 3.5050728683927446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339070608239126e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14142.0,
+      "completions/mean_length": 6384.640625,
+      "completions/mean_terminated_length": 5892.86865234375,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.840093269944191,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002166559686884284,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 142873848.0,
+      "reward": 0.4765625,
+      "reward_std": 0.35506346821784973,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000462532043457,
+      "sampling/importance_sampling_ratio/min": 4.785555574926548e-06,
+      "sampling/sampling_logp_difference/max": 12.249908447265625,
+      "sampling/sampling_logp_difference/mean": 0.018109092488884926,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.541105484648142e-05,
+      "clip_ratio/high_mean": 3.852763711620355e-06,
+      "clip_ratio/low_mean": 4.0552770769863855e-05,
+      "clip_ratio/low_min": 7.133888630050933e-06,
+      "clip_ratio/region_mean": 4.440553459517105e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14828.0,
+      "completions/mean_length": 5775.0,
+      "completions/mean_terminated_length": 5691.46435546875,
+      "completions/min_length": 1147.0,
+      "completions/min_terminated_length": 1147.0,
+      "entropy": 0.8915362879633904,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021932912059128284,
+      "learning_rate": 1e-05,
+      "loss": -0.0086,
+      "num_tokens": 143636152.0,
+      "reward": 0.4375,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000008225440979,
+      "sampling/importance_sampling_ratio/min": 9.714113069492214e-09,
+      "sampling/sampling_logp_difference/max": 18.44968605041504,
+      "sampling/sampling_logp_difference/mean": 0.019278086721897125,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7509142171311396e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7509142171311396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6181.640625,
+      "completions/mean_terminated_length": 6019.69873046875,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "entropy": 1.0544511675834656,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022947140969336033,
+      "learning_rate": 1e-05,
+      "loss": 0.0242,
+      "num_tokens": 144447370.0,
+      "reward": 0.234375,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999147653579712,
+      "sampling/importance_sampling_ratio/min": 7.419757253046555e-08,
+      "sampling/sampling_logp_difference/max": 16.416534423828125,
+      "sampling/sampling_logp_difference/mean": 0.02050788700580597,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.5700999938417226e-05,
+      "clip_ratio/high_mean": 3.9252499846043065e-06,
+      "clip_ratio/low_mean": 2.4595847037289786e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8521096965050674e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 6542.3046875,
+      "completions/mean_terminated_length": 6306.1044921875,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.933225467801094,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034910975955426693,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 145303505.0,
+      "reward": 0.390625,
+      "reward_std": 0.30433881282806396,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999945163726807,
+      "sampling/importance_sampling_ratio/min": 0.007213745731860399,
+      "sampling/sampling_logp_difference/max": 4.931766986846924,
+      "sampling/sampling_logp_difference/mean": 0.020022759214043617,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.0999414017715026e-06,
+      "clip_ratio/high_mean": 1.5249853504428756e-06,
+      "clip_ratio/low_mean": 2.61421698724007e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7667155109156738e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 5889.4765625,
+      "completions/mean_terminated_length": 5637.6083984375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 0.9649673849344254,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024078311398625374,
+      "learning_rate": 1e-05,
+      "loss": 0.0391,
+      "num_tokens": 146082198.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999341368675232,
+      "sampling/importance_sampling_ratio/min": 0.0008680344326421618,
+      "sampling/sampling_logp_difference/max": 7.04927921295166,
+      "sampling/sampling_logp_difference/mean": 0.02060198038816452,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 7.789618393871933e-06,
+      "clip_ratio/high_mean": 1.9474045984679833e-06,
+      "clip_ratio/low_mean": 3.6395756637830345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.834316100892465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16233.0,
+      "completions/mean_length": 5349.2421875,
+      "completions/mean_terminated_length": 5084.408203125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8402756005525589,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021191861014813185,
+      "learning_rate": 1e-05,
+      "loss": 0.1275,
+      "num_tokens": 146786245.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999837875366211,
+      "sampling/importance_sampling_ratio/min": 3.763807762879878e-05,
+      "sampling/sampling_logp_difference/max": 10.187494277954102,
+      "sampling/sampling_logp_difference/mean": 0.017112664878368378,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 1.2461773394534248e-05,
+      "clip_ratio/high_mean": 3.115443348633562e-06,
+      "clip_ratio/low_mean": 5.095924211673264e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4074685294835945e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 7272.3203125,
+      "completions/mean_terminated_length": 7053.64013671875,
+      "completions/min_length": 1074.0,
+      "completions/min_terminated_length": 1074.0,
+      "entropy": 0.9627499282360077,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022120666690170765,
+      "learning_rate": 1e-05,
+      "loss": 0.0079,
+      "num_tokens": 147737086.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27304792404174805,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999538660049438,
+      "sampling/importance_sampling_ratio/min": 1.6960719221970066e-05,
+      "sampling/sampling_logp_difference/max": 10.984610557556152,
+      "sampling/sampling_logp_difference/mean": 0.0203307643532753,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 1.7891727566166082e-05,
+      "clip_ratio/high_mean": 4.472931891541521e-06,
+      "clip_ratio/low_mean": 5.616715043288423e-05,
+      "clip_ratio/low_min": 7.80031223257538e-06,
+      "clip_ratio/region_mean": 6.064008221073891e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16212.0,
+      "completions/mean_length": 6387.1875,
+      "completions/mean_terminated_length": 5895.54052734375,
+      "completions/min_length": 1310.0,
+      "completions/min_terminated_length": 1310.0,
+      "entropy": 0.9110158830881119,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030851473566144705,
+      "learning_rate": 1e-05,
+      "loss": 0.1091,
+      "num_tokens": 148573782.0,
+      "reward": 0.40625,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997878074646,
+      "sampling/importance_sampling_ratio/min": 0.003961040172725916,
+      "sampling/sampling_logp_difference/max": 5.531248569488525,
+      "sampling/sampling_logp_difference/mean": 0.018049638718366623,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 1.6994396901282016e-05,
+      "clip_ratio/high_mean": 5.400205964178895e-06,
+      "clip_ratio/low_mean": 3.274822392995702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8148429439388565e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 7267.59375,
+      "completions/mean_terminated_length": 7195.81103515625,
+      "completions/min_length": 653.0,
+      "completions/min_terminated_length": 653.0,
+      "entropy": 0.9254888147115707,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020694085396826267,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 149521258.0,
+      "reward": 0.2734375,
+      "reward_std": 0.29719972610473633,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054670333862,
+      "sampling/importance_sampling_ratio/min": 7.411616934405174e-06,
+      "sampling/sampling_logp_difference/max": 11.812461853027344,
+      "sampling/sampling_logp_difference/mean": 0.01898832805454731,
+      "step": 192
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 149521258,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-192/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-192/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# While this script doesn't use DeepSpeed to recover the data, the checkpoints are pickled with
+# DeepSpeed data structures, so DeepSpeed has to be available in the current Python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Summary of one ``*_model_states.pt`` file, as assembled by ``parse_model_states``."""
+    # Annotations are real types: the earlier ``dict()`` spelling evaluated to an empty dict
+    # *instance*, which is meaningless to type checkers and to ``dataclasses.fields`` consumers.
+    buffers: dict  # buffer name -> value converted with .float()
+    param_shapes: list  # iterated as a sequence of name -> shape mappings by parse_model_states
+    shared_params: list  # list of [key, value] pairs
+    ds_version: int  # None when the checkpoint has no version entry; NOTE(review): may be a string - confirm
+    frozen_param_shapes: dict  # None when the checkpoint records no frozen parameters
+    frozen_param_fragments: dict  # None when the checkpoint records no frozen parameters
+
+
+# set to a truthy value to enable the extra `if debug:` diagnostic prints in this script
+debug = 0
+
+# load to cpu (used as `map_location` by the torch.load calls in this script)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Return ``int(text)`` when ``text.isdigit()`` is true, otherwise return ``text`` unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+
+
+def natural_keys(text):
+    '''
+    Sort key for "human" ordering: alist.sort(key=natural_keys)
+
+    The text is split on runs of digits (the runs themselves are kept) and every piece is
+    passed through atoi, so digit runs become ints and the rest stay strings.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the rank-0 model states file inside ``checkpoint_dir``.
+
+    Args:
+        - ``checkpoint_dir``: directory that holds the checkpoint files
+        - ``zero_stage``: ZeRO stage of the checkpoint; stages <= 2 and stage 3 use different file names
+
+    Raises:
+        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
+        - ``ValueError``: ``zero_stage`` is neither <= 2 nor 3
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch `file` stays unbound and the check below dies with UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the paths in ``checkpoint_dir`` that match ``glob_pattern``, ordered with ``natural_keys``.
+
+    Raises ``FileNotFoundError`` when nothing matches.
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    matches.sort(key=natural_keys)
+    return matches
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found by ``get_checkpoint_files`` in ``checkpoint_dir``."""
+    optim_glob = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_glob)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found by ``get_checkpoint_files`` in ``checkpoint_dir``."""
+    model_glob = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_glob)
+
+
+def parse_model_states(files):
+    """
+    Load every model states file in ``files`` and summarize each as a ``zero_model_state``.
+
+    Returns a list with one ``zero_model_state`` per input file, in input order.
+    Raises ``ValueError`` for a file whose state dict has no ``BUFFER_NAMES`` entry.
+    """
+    parsed_states = []
+    for path in files:
+        # NOTE(review): weights_only=False means the file is fully unpickled - only use this on
+        # checkpoints from a trusted source
+        ckpt = torch.load(path, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in ckpt:
+            raise ValueError(f"{path} is not a model state checkpoint")
+        buffer_names = ckpt[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {}
+        for key, value in ckpt["module"].items():
+            if key in buffer_names:
+                buffers[key] = value.float()
+
+        param_shapes = ckpt[PARAM_SHAPES]
+
+        # names of all parameters listed in param_shapes (each entry is a name -> shape mapping);
+        # the list is only accumulated here and is not read again in this function
+        param_names = [name for shapes in param_shapes for name in shapes.keys()]
+
+        # frozen parameters, when recorded, extend the name list
+        frozen_param_shapes = ckpt.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # shared params are kept as [key, value] pairs
+        shared_params = [[k, v] for k, v in ckpt["shared_params"].items()]
+
+        parsed_states.append(
+            zero_model_state(buffers=buffers,
+                             param_shapes=param_shapes,
+                             shared_params=shared_params,
+                             ds_version=ckpt.get(DS_VERSION, None),
+                             frozen_param_shapes=frozen_param_shapes,
+                             frozen_param_fragments=ckpt.get(FROZEN_PARAM_FRAGMENTS, None)))
+
+    return parsed_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state files and extract the ZeRO stage, world size and fp32 groups.
+
+    Args:
+        - ``files``: paths of the optimizer state files to load
+        - ``ds_checkpoint_dir``: checkpoint folder; only used in the file-count error message
+
+    Returns:
+        ``(zero_stage, world_size, fp32_flat_groups)``, where ``fp32_flat_groups`` holds one entry
+        per loaded file, in file order.
+
+    Raises:
+        ``ValueError`` when the first file has no ZeRO stage entry, when the number of files differs
+        from the recorded partition count, or when the stage is neither <= 2 nor 3.
+    """
+    num_files = len(files)
+    loaded = []
+    for path in tqdm(files, desc='Loading checkpoint shards'):
+        shard = torch.load(path, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        loaded.append(shard)
+
+    # stage and partition count are read from the first file only
+    first_optim = loaded[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim[ZERO_STAGE]
+    world_size = first_optim[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != num_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {num_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][groups_key] for shard in loaded]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: passed through unchanged to the stage-specific reconstruction
+          routine (presumably makes it skip frozen parameters - confirm against those routines)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    # choose the reconstruction routine for this stage; a stage matching neither branch
+    # makes the function return None
+    if zero_stage <= 2:
+        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return rebuild(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor with ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/README.md b/dapo_lora_plus_20251202_001141/checkpoint-256/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-256/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-256/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-256/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-256/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-256/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/latest b/dapo_lora_plus_20251202_001141/checkpoint-256/latest
new file mode 100644
index 0000000000000000000000000000000000000000..b747f9725067064e241a7a3bed90583971af8ad1
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-256/latest
@@ -0,0 +1 @@
+global_step256
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-256/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-256/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/tokenizer_config.json b/dapo_lora_plus_20251202_001141/checkpoint-256/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-256/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-256/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea6a125e6f21394d9d572856f65dd117d5ebc999
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-256/trainer_state.json
@@ -0,0 +1,7970 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.23551057957681693,
+  "eval_steps": 500,
+  "global_step": 256,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025745572056621313,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 5.499582130141789e-06,
+      "clip_ratio/high_mean": 1.3748955325354473e-06,
+      "clip_ratio/low_mean": 2.871888784738985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009378326623846e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 4767.1875,
+      "completions/mean_terminated_length": 4767.1875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.088237851858139,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002068034838885069,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 1425798.0,
+      "reward": 0.3046875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 0.01811397261917591,
+      "sampling/sampling_logp_difference/max": 4.011071681976318,
+      "sampling/sampling_logp_difference/mean": 0.01877593621611595,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.459846724103045e-05,
+      "clip_ratio/low_min": 3.4060874440910993e-06,
+      "clip_ratio/region_mean": 4.459846724103045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 6586.359375,
+      "completions/mean_terminated_length": 6351.21630859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0497623533010483,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001971944235265255,
+      "learning_rate": 1e-05,
+      "loss": 0.0199,
+      "num_tokens": 2287420.0,
+      "reward": 0.28125,
+      "reward_std": 0.29143062233924866,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999316334724426,
+      "sampling/importance_sampling_ratio/min": 5.356698966352269e-05,
+      "sampling/sampling_logp_difference/max": 9.834577560424805,
+      "sampling/sampling_logp_difference/mean": 0.02137824520468712,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.7640652004047297e-05,
+      "clip_ratio/high_mean": 5.48578327652649e-06,
+      "clip_ratio/low_mean": 3.218628648937738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.767206976590387e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14690.0,
+      "completions/max_terminated_length": 14690.0,
+      "completions/mean_length": 5448.0234375,
+      "completions/mean_terminated_length": 5448.0234375,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 1.1134418621659279,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016465173102915287,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 3009167.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27958330512046814,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 7.889385415182915e-06,
+      "sampling/sampling_logp_difference/max": 11.749992370605469,
+      "sampling/sampling_logp_difference/mean": 0.020580951124429703,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 1.3439519989333348e-05,
+      "clip_ratio/high_mean": 3.359879997333337e-06,
+      "clip_ratio/low_mean": 2.8849915906903334e-05,
+      "clip_ratio/low_min": 8.467687621305231e-06,
+      "clip_ratio/region_mean": 3.220979442630778e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13420.0,
+      "completions/mean_length": 5436.8671875,
+      "completions/mean_terminated_length": 5350.66943359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 1.1473859176039696,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023770295083522797,
+      "learning_rate": 1e-05,
+      "loss": 0.0153,
+      "num_tokens": 3725654.0,
+      "reward": 0.2734375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99991774559021,
+      "sampling/importance_sampling_ratio/min": 0.0011146117467433214,
+      "sampling/sampling_logp_difference/max": 6.799249172210693,
+      "sampling/sampling_logp_difference/mean": 0.020377254113554955,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 4.652201369026443e-06,
+      "clip_ratio/high_mean": 1.1630503422566107e-06,
+      "clip_ratio/low_mean": 2.8399212624208303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9562263534899103e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14440.0,
+      "completions/max_terminated_length": 14440.0,
+      "completions/mean_length": 4697.5390625,
+      "completions/mean_terminated_length": 4697.5390625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.0097229778766632,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003342699259519577,
+      "learning_rate": 1e-05,
+      "loss": 0.0326,
+      "num_tokens": 4345547.0,
+      "reward": 0.390625,
+      "reward_std": 0.34480881690979004,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914765357971,
+      "sampling/importance_sampling_ratio/min": 0.002385853324085474,
+      "sampling/sampling_logp_difference/max": 6.038198471069336,
+      "sampling/sampling_logp_difference/mean": 0.0185473021119833,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 9.362594937556423e-06,
+      "clip_ratio/high_mean": 2.340648734389106e-06,
+      "clip_ratio/low_mean": 6.054362825125281e-05,
+      "clip_ratio/low_min": 7.427356649714056e-06,
+      "clip_ratio/region_mean": 6.288427744038927e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14652.0,
+      "completions/mean_length": 6218.2109375,
+      "completions/mean_terminated_length": 5890.2822265625,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.0579778030514717,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002073560608550906,
+      "learning_rate": 1e-05,
+      "loss": 0.0201,
+      "num_tokens": 5160646.0,
+      "reward": 0.2109375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 0.00044544730917550623,
+      "sampling/sampling_logp_difference/max": 7.716431617736816,
+      "sampling/sampling_logp_difference/mean": 0.020321575924754143,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 1.1064067621191498e-05,
+      "clip_ratio/high_mean": 2.7660169052978745e-06,
+      "clip_ratio/low_mean": 2.2175867059104348e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4941883737028547e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13637.0,
+      "completions/mean_length": 5127.8359375,
+      "completions/mean_terminated_length": 5039.20458984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.0472618415951729,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032994600478559732,
+      "learning_rate": 1e-05,
+      "loss": 0.0751,
+      "num_tokens": 5836289.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999483227729797,
+      "sampling/importance_sampling_ratio/min": 0.0013780994340777397,
+      "sampling/sampling_logp_difference/max": 6.587049961090088,
+      "sampling/sampling_logp_difference/mean": 0.01940803974866867,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 1.2357884770608507e-05,
+      "clip_ratio/high_mean": 3.0894711926521268e-06,
+      "clip_ratio/low_mean": 3.000627111759968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.309574231025181e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15916.0,
+      "completions/mean_length": 4516.890625,
+      "completions/mean_terminated_length": 4423.44873046875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "entropy": 0.911251038312912,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003016560571268201,
+      "learning_rate": 1e-05,
+      "loss": 0.1006,
+      "num_tokens": 6433171.0,
+      "reward": 0.390625,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999179840087891,
+      "sampling/importance_sampling_ratio/min": 0.005480794236063957,
+      "sampling/sampling_logp_difference/max": 5.206505298614502,
+      "sampling/sampling_logp_difference/mean": 0.017437148839235306,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 4.6329013457580004e-05,
+      "clip_ratio/high_mean": 1.1582253364395001e-05,
+      "clip_ratio/low_mean": 7.069455705277505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.227681109929108e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13970.0,
+      "completions/mean_length": 4961.453125,
+      "completions/mean_terminated_length": 4687.31201171875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "entropy": 0.6808596402406693,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0035386616364121437,
+      "learning_rate": 1e-05,
+      "loss": 0.0596,
+      "num_tokens": 7085389.0,
+      "reward": 0.5625,
+      "reward_std": 0.3816363215446472,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0002734088629949838,
+      "sampling/sampling_logp_difference/max": 8.20454216003418,
+      "sampling/sampling_logp_difference/mean": 0.01566406339406967,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 2.43190661421977e-05,
+      "clip_ratio/high_mean": 6.079766535549425e-06,
+      "clip_ratio/low_mean": 2.2395396172214532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8475162707763957e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14776.0,
+      "completions/mean_length": 4429.40625,
+      "completions/mean_terminated_length": 4335.275390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9181502386927605,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022535293828696012,
+      "learning_rate": 1e-05,
+      "loss": 0.0031,
+      "num_tokens": 7672185.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20357418060302734,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998801946640015,
+      "sampling/importance_sampling_ratio/min": 5.315856554943821e-08,
+      "sampling/sampling_logp_difference/max": 16.74998664855957,
+      "sampling/sampling_logp_difference/mean": 0.018429335206747055,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 1.0117325928149512e-05,
+      "clip_ratio/high_mean": 2.529331482037378e-06,
+      "clip_ratio/low_mean": 1.1982813475697185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.45121450714214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5282.6796875,
+      "completions/mean_terminated_length": 5106.46875,
+      "completions/min_length": 323.0,
+      "completions/min_terminated_length": 323.0,
+      "entropy": 1.113751620054245,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013591813622042537,
+      "learning_rate": 1e-05,
+      "loss": 0.0971,
+      "num_tokens": 8369000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3029736578464508,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 3.970265970565379e-05,
+      "sampling/sampling_logp_difference/max": 10.134092330932617,
+      "sampling/sampling_logp_difference/mean": 0.020221836864948273,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 5.411958227341529e-06,
+      "clip_ratio/high_mean": 1.3529895568353822e-06,
+      "clip_ratio/low_mean": 2.5284593846208736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6637583516730956e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15925.0,
+      "completions/mean_length": 6970.421875,
+      "completions/mean_terminated_length": 6744.49609375,
+      "completions/min_length": 53.0,
+      "completions/min_terminated_length": 53.0,
+      "entropy": 1.1721933633089066,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024079051800072193,
+      "learning_rate": 1e-05,
+      "loss": 0.0713,
+      "num_tokens": 9283182.0,
+      "reward": 0.171875,
+      "reward_std": 0.17965975403785706,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999163746833801,
+      "sampling/importance_sampling_ratio/min": 0.0008915197686292231,
+      "sampling/sampling_logp_difference/max": 7.0225830078125,
+      "sampling/sampling_logp_difference/mean": 0.021462474018335342,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 2.0661535927501973e-05,
+      "clip_ratio/high_mean": 5.165383981875493e-06,
+      "clip_ratio/low_mean": 2.4304956298237812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.947033948430544e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14658.0,
+      "completions/max_terminated_length": 14658.0,
+      "completions/mean_length": 4886.875,
+      "completions/mean_terminated_length": 4886.875,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 1.0108910650014877,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002063734456896782,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 9928446.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2409384697675705,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000026226043701,
+      "sampling/importance_sampling_ratio/min": 0.0003672837920021266,
+      "sampling/sampling_logp_difference/max": 7.9093756675720215,
+      "sampling/sampling_logp_difference/mean": 0.01918785460293293,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.4761846993424115e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4761846993424115e-06,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12992.0,
+      "completions/max_terminated_length": 12992.0,
+      "completions/mean_length": 4824.0078125,
+      "completions/mean_terminated_length": 4824.0078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 1.1070282831788063,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002424790756776929,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 10566415.0,
+      "reward": 0.28125,
+      "reward_std": 0.23698672652244568,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0011708867968991399,
+      "sampling/sampling_logp_difference/max": 6.749993801116943,
+      "sampling/sampling_logp_difference/mean": 0.02069389820098877,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 3.5075904634140898e-06,
+      "clip_ratio/high_mean": 8.768976158535224e-07,
+      "clip_ratio/low_mean": 2.2676964135825983e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3553861751679506e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12685.0,
+      "completions/mean_length": 5449.4140625,
+      "completions/mean_terminated_length": 5363.31494140625,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9817888736724854,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021046048495918512,
+      "learning_rate": 1e-05,
+      "loss": 0.0252,
+      "num_tokens": 11281908.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805688858032,
+      "sampling/importance_sampling_ratio/min": 0.013273254036903381,
+      "sampling/sampling_logp_difference/max": 4.322004318237305,
+      "sampling/sampling_logp_difference/mean": 0.019556276500225067,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 1.624216065465589e-05,
+      "clip_ratio/high_mean": 4.060540163663973e-06,
+      "clip_ratio/low_mean": 5.4349347919924185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.840988796990132e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14133.0,
+      "completions/max_terminated_length": 14133.0,
+      "completions/mean_length": 5343.25,
+      "completions/mean_terminated_length": 5343.25,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 1.04741720110178,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035894038155674934,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 11987692.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998996257781982,
+      "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05,
+      "sampling/sampling_logp_difference/max": 10.749964714050293,
+      "sampling/sampling_logp_difference/mean": 0.020530637353658676,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.272115029380075e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.272115029380075e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15138.0,
+      "completions/mean_length": 6301.9375,
+      "completions/mean_terminated_length": 5806.09814453125,
+      "completions/min_length": 72.0,
+      "completions/min_terminated_length": 72.0,
+      "entropy": 0.8892941772937775,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032246762420982122,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 12814244.0,
+      "reward": 0.3125,
+      "reward_std": 0.3606000542640686,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999184608459473,
+      "sampling/importance_sampling_ratio/min": 0.021351110190153122,
+      "sampling/sampling_logp_difference/max": 3.846651554107666,
+      "sampling/sampling_logp_difference/mean": 0.017541853711009026,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 9.956602298188955e-06,
+      "clip_ratio/high_mean": 2.4891505745472386e-06,
+      "clip_ratio/low_mean": 2.772165316855535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0210803743102588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16213.0,
+      "completions/max_terminated_length": 16213.0,
+      "completions/mean_length": 5297.46875,
+      "completions/mean_terminated_length": 5297.46875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8097029253840446,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023969109170138836,
+      "learning_rate": 1e-05,
+      "loss": -0.0153,
+      "num_tokens": 13512520.0,
+      "reward": 0.359375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999222159385681,
+      "sampling/importance_sampling_ratio/min": 0.005766105372458696,
+      "sampling/sampling_logp_difference/max": 5.155758380889893,
+      "sampling/sampling_logp_difference/mean": 0.017464376986026764,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 1.0098337497765897e-05,
+      "clip_ratio/high_mean": 2.524584374441474e-06,
+      "clip_ratio/low_mean": 3.173396362399217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.425854845318099e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14655.0,
+      "completions/mean_length": 4890.34375,
+      "completions/mean_terminated_length": 4799.84228515625,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.9267145916819572,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002759338356554508,
+      "learning_rate": 1e-05,
+      "loss": -0.0014,
+      "num_tokens": 14155556.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.008491010405123234,
+      "sampling/sampling_logp_difference/max": 4.768747329711914,
+      "sampling/sampling_logp_difference/mean": 0.018839433789253235,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 7.532389190600952e-06,
+      "clip_ratio/high_mean": 1.883097297650238e-06,
+      "clip_ratio/low_mean": 1.9051809317716106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0934906729053182e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16296.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 4609.40625,
+      "completions/mean_terminated_length": 4609.40625,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 1.171089917421341,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021055075339972973,
+      "learning_rate": 1e-05,
+      "loss": -0.0051,
+      "num_tokens": 14765328.0,
+      "reward": 0.2421875,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741911888123,
+      "sampling/importance_sampling_ratio/min": 5.368983693188056e-07,
+      "sampling/sampling_logp_difference/max": 14.437457084655762,
+      "sampling/sampling_logp_difference/mean": 0.020226795226335526,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.7169573766295798e-05,
+      "clip_ratio/high_mean": 4.2923934415739495e-06,
+      "clip_ratio/low_mean": 5.869748633813288e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.0162142189074075e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14299.0,
+      "completions/mean_length": 5099.0390625,
+      "completions/mean_terminated_length": 5010.18115234375,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.005959376692772,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027595218271017075,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 15438549.0,
+      "reward": 0.296875,
+      "reward_std": 0.20069602131843567,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887347221375,
+      "sampling/importance_sampling_ratio/min": 0.00013984869292471558,
+      "sampling/sampling_logp_difference/max": 8.87494945526123,
+      "sampling/sampling_logp_difference/mean": 0.01902824640274048,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 5.162942670722259e-06,
+      "clip_ratio/high_mean": 1.2907356676805648e-06,
+      "clip_ratio/low_mean": 3.6872071063953626e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.816280593582633e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7138.0390625,
+      "completions/mean_terminated_length": 6839.7822265625,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.0403362140059471,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002748022088780999,
+      "learning_rate": 1e-05,
+      "loss": 0.0647,
+      "num_tokens": 16373898.0,
+      "reward": 0.296875,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999048709869385,
+      "sampling/importance_sampling_ratio/min": 0.0003802926803473383,
+      "sampling/sampling_logp_difference/max": 7.874569416046143,
+      "sampling/sampling_logp_difference/mean": 0.020853528752923012,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.6506045439164154e-05,
+      "clip_ratio/low_min": 5.709326615033206e-06,
+      "clip_ratio/region_mean": 5.6506045439164154e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14543.0,
+      "completions/mean_length": 5420.515625,
+      "completions/mean_terminated_length": 5334.18896484375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 1.1339883506298065,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029502976685762405,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 17088156.0,
+      "reward": 0.1953125,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 9.70982582657598e-05,
+      "sampling/sampling_logp_difference/max": 9.239787101745605,
+      "sampling/sampling_logp_difference/mean": 0.0199423898011446,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 5.619998319161823e-06,
+      "clip_ratio/high_mean": 1.4049995797904558e-06,
+      "clip_ratio/low_mean": 6.439320418394345e-05,
+      "clip_ratio/low_min": 4.70632539872895e-06,
+      "clip_ratio/region_mean": 6.57982034226734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14636.0,
+      "completions/mean_length": 5116.3046875,
+      "completions/mean_terminated_length": 4845.88037109375,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.9503882825374603,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004891107324510813,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 17766619.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0010618992382660508,
+      "sampling/sampling_logp_difference/max": 6.847696304321289,
+      "sampling/sampling_logp_difference/mean": 0.01914183795452118,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.839018643247982e-05,
+      "clip_ratio/low_min": 4.115091087442124e-06,
+      "clip_ratio/region_mean": 3.839018643247982e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14634.0,
+      "completions/mean_length": 5061.8671875,
+      "completions/mean_terminated_length": 4972.71630859375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.0540335327386856,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030373274348676205,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 18432938.0,
+      "reward": 0.34375,
+      "reward_std": 0.28118088841438293,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999624490737915,
+      "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06,
+      "sampling/sampling_logp_difference/max": 13.272432327270508,
+      "sampling/sampling_logp_difference/mean": 0.019548218697309494,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.4656657867817557e-05,
+      "clip_ratio/high_mean": 4.665093399580655e-06,
+      "clip_ratio/low_mean": 3.751162262233265e-05,
+      "clip_ratio/low_min": 4.413062470121076e-06,
+      "clip_ratio/region_mean": 4.2176716192443564e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15782.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6349.9765625,
+      "completions/mean_terminated_length": 6349.9765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0268081277608871,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017623496241867542,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 19264743.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 6.870362267363816e-05,
+      "sampling/sampling_logp_difference/max": 9.585708618164062,
+      "sampling/sampling_logp_difference/mean": 0.019106190651655197,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 9.221375876222737e-06,
+      "clip_ratio/high_mean": 2.3053439690556843e-06,
+      "clip_ratio/low_mean": 3.09787185415189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.328406273794826e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 5815.484375,
+      "completions/mean_terminated_length": 5561.84033203125,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 1.0389493256807327,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003111837198957801,
+      "learning_rate": 1e-05,
+      "loss": -0.0162,
+      "num_tokens": 20030109.0,
+      "reward": 0.34375,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000298023223877,
+      "sampling/importance_sampling_ratio/min": 0.02987043187022209,
+      "sampling/sampling_logp_difference/max": 3.5108861923217773,
+      "sampling/sampling_logp_difference/mean": 0.020060991868376732,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 6.7810142354574054e-06,
+      "clip_ratio/high_mean": 1.6952535588643514e-06,
+      "clip_ratio/low_mean": 4.474762545214617e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644287901101052e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 5157.1484375,
+      "completions/mean_terminated_length": 5068.748046875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.0510126948356628,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003041633637621999,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 20710904.0,
+      "reward": 0.3125,
+      "reward_std": 0.35612428188323975,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 0.04357198625802994,
+      "sampling/sampling_logp_difference/max": 3.133340835571289,
+      "sampling/sampling_logp_difference/mean": 0.019007597118616104,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0962848566341563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0962848566341563e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15333.0,
+      "completions/max_terminated_length": 15333.0,
+      "completions/mean_length": 4446.3828125,
+      "completions/mean_terminated_length": 4446.3828125,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.053279548883438,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022369560319930315,
+      "learning_rate": 1e-05,
+      "loss": -0.001,
+      "num_tokens": 21298497.0,
+      "reward": 0.390625,
+      "reward_std": 0.24169495701789856,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998750686645508,
+      "sampling/importance_sampling_ratio/min": 0.006704842206090689,
+      "sampling/sampling_logp_difference/max": 5.00492525100708,
+      "sampling/sampling_logp_difference/mean": 0.01947362720966339,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8460265411922592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8460265411922592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15386.0,
+      "completions/mean_length": 6294.1484375,
+      "completions/mean_terminated_length": 6133.9921875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.2036212533712387,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021383841522037983,
+      "learning_rate": 1e-05,
+      "loss": 0.033,
+      "num_tokens": 22124812.0,
+      "reward": 0.171875,
+      "reward_std": 0.20752590894699097,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858736991882,
+      "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07,
+      "sampling/sampling_logp_difference/max": 14.742476463317871,
+      "sampling/sampling_logp_difference/mean": 0.022367021068930626,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 1.73864664247958e-05,
+      "clip_ratio/high_mean": 4.34661660619895e-06,
+      "clip_ratio/low_mean": 3.19569651310303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.630358173722925e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14893.0,
+      "completions/mean_length": 6011.4921875,
+      "completions/mean_terminated_length": 5929.81884765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.123318687081337,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00126531848218292,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 22915091.0,
+      "reward": 0.171875,
+      "reward_std": 0.2330477386713028,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05,
+      "sampling/sampling_logp_difference/max": 11.02016544342041,
+      "sampling/sampling_logp_difference/mean": 0.019905246794223785,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 2.8753217975463485e-05,
+      "clip_ratio/high_mean": 7.188304493865871e-06,
+      "clip_ratio/low_mean": 3.818478444372886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.537308905128157e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5152.46875,
+      "completions/mean_terminated_length": 5064.03125,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "entropy": 1.0477670058608055,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030069497879594564,
+      "learning_rate": 1e-05,
+      "loss": 0.1026,
+      "num_tokens": 23596487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29142576456069946,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999433755874634,
+      "sampling/importance_sampling_ratio/min": 9.009604013954231e-07,
+      "sampling/sampling_logp_difference/max": 13.919804573059082,
+      "sampling/sampling_logp_difference/mean": 0.019003981724381447,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 3.069575450354023e-05,
+      "clip_ratio/high_mean": 7.673938625885057e-06,
+      "clip_ratio/low_mean": 3.4847614415411954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.252155258654966e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12792.0,
+      "completions/max_terminated_length": 12792.0,
+      "completions/mean_length": 4672.5703125,
+      "completions/mean_terminated_length": 4672.5703125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9471446052193642,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002676331205293536,
+      "learning_rate": 1e-05,
+      "loss": 0.0724,
+      "num_tokens": 24213408.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2988021969795227,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000251531600952,
+      "sampling/importance_sampling_ratio/min": 0.0013351094676181674,
+      "sampling/sampling_logp_difference/max": 6.618741989135742,
+      "sampling/sampling_logp_difference/mean": 0.0179576613008976,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6127243245355203e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6127243245355203e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16108.0,
+      "completions/mean_length": 7013.734375,
+      "completions/mean_terminated_length": 6711.4677734375,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "entropy": 1.1254516392946243,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023615453392267227,
+      "learning_rate": 1e-05,
+      "loss": 0.0384,
+      "num_tokens": 25130262.0,
+      "reward": 0.1953125,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06,
+      "sampling/sampling_logp_difference/max": 11.925450325012207,
+      "sampling/sampling_logp_difference/mean": 0.0215257927775383,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 4.06954040954588e-06,
+      "clip_ratio/high_mean": 1.01738510238647e-06,
+      "clip_ratio/low_mean": 4.180071573500754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.281810015527299e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5858.59375,
+      "completions/mean_terminated_length": 5605.984375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "entropy": 1.0713739022612572,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029018481727689505,
+      "learning_rate": 1e-05,
+      "loss": 0.1041,
+      "num_tokens": 25898194.0,
+      "reward": 0.3671875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999915957450867,
+      "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05,
+      "sampling/sampling_logp_difference/max": 10.992064476013184,
+      "sampling/sampling_logp_difference/mean": 0.019959844648838043,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 1.2810827229259303e-05,
+      "clip_ratio/high_mean": 3.2027068073148257e-06,
+      "clip_ratio/low_mean": 3.29701083501277e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.617281504375569e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14004.0,
+      "completions/mean_length": 6952.6015625,
+      "completions/mean_terminated_length": 6726.24853515625,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "entropy": 1.028619796037674,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022342968732118607,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 26812791.0,
+      "reward": 0.234375,
+      "reward_std": 0.26827272772789,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 4.540153167909011e-05,
+      "sampling/sampling_logp_difference/max": 9.999964714050293,
+      "sampling/sampling_logp_difference/mean": 0.02002539485692978,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 1.5225089100567857e-05,
+      "clip_ratio/high_mean": 6.960676159906143e-06,
+      "clip_ratio/low_mean": 4.09088329433871e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7869508762232726e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16361.0,
+      "completions/mean_length": 6413.421875,
+      "completions/mean_terminated_length": 6174.12841796875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9452399462461472,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021800603717565536,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 27652757.0,
+      "reward": 0.296875,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439120292664,
+      "sampling/importance_sampling_ratio/min": 3.895394547726028e-05,
+      "sampling/sampling_logp_difference/max": 10.153130531311035,
+      "sampling/sampling_logp_difference/mean": 0.019722118973731995,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.9564903318023426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9564903318023426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15754.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 5176.3515625,
+      "completions/mean_terminated_length": 5176.3515625,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 1.0444758981466293,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004153470974415541,
+      "learning_rate": 1e-05,
+      "loss": 0.0798,
+      "num_tokens": 28334386.0,
+      "reward": 0.2734375,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.007421077694743872,
+      "sampling/sampling_logp_difference/max": 4.903430938720703,
+      "sampling/sampling_logp_difference/mean": 0.020159056410193443,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 1.725743459246587e-05,
+      "clip_ratio/high_mean": 4.3143586481164675e-06,
+      "clip_ratio/low_mean": 2.0204584302518924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.451894306432223e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 5178.9921875,
+      "completions/mean_terminated_length": 5001.13525390625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0803537145256996,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002477057045325637,
+      "learning_rate": 1e-05,
+      "loss": 0.0067,
+      "num_tokens": 29017145.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000497102737427,
+      "sampling/importance_sampling_ratio/min": 0.004630985204130411,
+      "sampling/sampling_logp_difference/max": 5.374985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019826076924800873,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 1.6637992303003557e-05,
+      "clip_ratio/high_mean": 4.159498075750889e-06,
+      "clip_ratio/low_mean": 2.1970684144889674e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6130182106953725e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14131.0,
+      "completions/max_terminated_length": 14131.0,
+      "completions/mean_length": 4980.359375,
+      "completions/mean_terminated_length": 4980.359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.9510642662644386,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016275218222290277,
+      "learning_rate": 1e-05,
+      "loss": -0.0097,
+      "num_tokens": 29673535.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750852584839,
+      "sampling/importance_sampling_ratio/min": 0.000599516904912889,
+      "sampling/sampling_logp_difference/max": 7.419386386871338,
+      "sampling/sampling_logp_difference/mean": 0.01844976656138897,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 2.8087193186365766e-05,
+      "clip_ratio/high_mean": 7.021798296591442e-06,
+      "clip_ratio/low_mean": 3.9683913541921356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.670571286169434e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 5778.6953125,
+      "completions/mean_terminated_length": 5695.18896484375,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 1.0413239300251007,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001847646082751453,
+      "learning_rate": 1e-05,
+      "loss": -0.0045,
+      "num_tokens": 30436416.0,
+      "reward": 0.2578125,
+      "reward_std": 0.33903977274894714,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998501539230347,
+      "sampling/importance_sampling_ratio/min": 0.00020348970429040492,
+      "sampling/sampling_logp_difference/max": 8.499895095825195,
+      "sampling/sampling_logp_difference/mean": 0.021502099931240082,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 2.68402091023745e-05,
+      "clip_ratio/high_mean": 8.575278570788214e-06,
+      "clip_ratio/low_mean": 4.547183698377921e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.404711600931478e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14182.0,
+      "completions/max_terminated_length": 14182.0,
+      "completions/mean_length": 4875.125,
+      "completions/mean_terminated_length": 4875.125,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "entropy": 1.0464690178632736,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021134833805263042,
+      "learning_rate": 1e-05,
+      "loss": 0.0727,
+      "num_tokens": 31083672.0,
+      "reward": 0.40625,
+      "reward_std": 0.3584783971309662,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340176582336,
+      "sampling/importance_sampling_ratio/min": 0.012113225646317005,
+      "sampling/sampling_logp_difference/max": 4.41345739364624,
+      "sampling/sampling_logp_difference/mean": 0.019140049815177917,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 3.9877967992651975e-05,
+      "clip_ratio/high_mean": 9.969491998162994e-06,
+      "clip_ratio/low_mean": 3.981287841270387e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9782369273998484e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 4691.421875,
+      "completions/mean_terminated_length": 4505.82568359375,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 1.0229775309562683,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037735572550445795,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 31703654.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2993389964103699,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492168426514,
+      "sampling/importance_sampling_ratio/min": 0.03150063753128052,
+      "sampling/sampling_logp_difference/max": 3.457747459411621,
+      "sampling/sampling_logp_difference/mean": 0.01912039890885353,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 3.5441889849607833e-06,
+      "clip_ratio/high_mean": 8.860472462401958e-07,
+      "clip_ratio/low_mean": 1.5137359810069029e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6023407056309225e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 6821.96875,
+      "completions/mean_terminated_length": 6592.48046875,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 1.1132484003901482,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0010448681423440576,
+      "learning_rate": 1e-05,
+      "loss": 0.022,
+      "num_tokens": 32599778.0,
+      "reward": 0.2265625,
+      "reward_std": 0.1814819872379303,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999915361404419,
+      "sampling/importance_sampling_ratio/min": 0.006500681862235069,
+      "sampling/sampling_logp_difference/max": 5.035848140716553,
+      "sampling/sampling_logp_difference/mean": 0.02125459350645542,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 4.652893949241843e-06,
+      "clip_ratio/high_mean": 1.1632234873104608e-06,
+      "clip_ratio/low_mean": 5.731516603191267e-05,
+      "clip_ratio/low_min": 9.891066838463303e-06,
+      "clip_ratio/region_mean": 5.8478389746596804e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 6834.3671875,
+      "completions/mean_terminated_length": 6605.17626953125,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9827468693256378,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0017670176457613707,
+      "learning_rate": 1e-05,
+      "loss": 0.1105,
+      "num_tokens": 33492737.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3440523147583008,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.0021202093921601772,
+      "sampling/sampling_logp_difference/max": 6.156240463256836,
+      "sampling/sampling_logp_difference/mean": 0.019490526989102364,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 6.717360520269722e-06,
+      "clip_ratio/high_mean": 2.503530367903295e-06,
+      "clip_ratio/low_mean": 2.5672919832686603e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8176450200589898e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14098.0,
+      "completions/mean_length": 6175.296875,
+      "completions/mean_terminated_length": 5845.98388671875,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 1.1584237962961197,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0016891945851966739,
+      "learning_rate": 1e-05,
+      "loss": -0.0008,
+      "num_tokens": 34312455.0,
+      "reward": 0.1875,
+      "reward_std": 0.19673937559127808,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 8.086384332273155e-05,
+      "sampling/sampling_logp_difference/max": 9.422743797302246,
+      "sampling/sampling_logp_difference/mean": 0.021749887615442276,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 2.2362002255249536e-05,
+      "clip_ratio/high_mean": 8.189798336388776e-06,
+      "clip_ratio/low_mean": 2.1058204993096297e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9248002192616696e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16054.0,
+      "completions/mean_length": 6036.8359375,
+      "completions/mean_terminated_length": 5955.3623046875,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.9301538467407227,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003834392176941037,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 35102738.0,
+      "reward": 0.4375,
+      "reward_std": 0.36614155769348145,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998494386672974,
+      "sampling/importance_sampling_ratio/min": 0.00013992394087836146,
+      "sampling/sampling_logp_difference/max": 8.874411582946777,
+      "sampling/sampling_logp_difference/mean": 0.019147861748933792,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1501961580506759e-05,
+      "clip_ratio/high_mean": 2.8754903951266897e-06,
+      "clip_ratio/low_mean": 4.08189714562468e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.369446196506033e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6262.46875,
+      "completions/mean_terminated_length": 5764.68798828125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "entropy": 0.8599015846848488,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029804729856550694,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 35924886.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3911295533180237,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999922513961792,
+      "sampling/importance_sampling_ratio/min": 0.00021375219512265176,
+      "sampling/sampling_logp_difference/max": 9.904524803161621,
+      "sampling/sampling_logp_difference/mean": 0.01815103553235531,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 2.4107544049911667e-05,
+      "clip_ratio/high_mean": 6.026886012477917e-06,
+      "clip_ratio/low_mean": 3.6588148361715866e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.261503391944643e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14556.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 5926.8984375,
+      "completions/mean_terminated_length": 5926.8984375,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "entropy": 1.0042993426322937,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022071697749197483,
+      "learning_rate": 1e-05,
+      "loss": 0.0059,
+      "num_tokens": 36700913.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3306073546409607,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000010371208191,
+      "sampling/importance_sampling_ratio/min": 0.0005220364546403289,
+      "sampling/sampling_logp_difference/max": 7.557773113250732,
+      "sampling/sampling_logp_difference/mean": 0.01954064890742302,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 4.9106265578302555e-06,
+      "clip_ratio/high_mean": 1.2276566394575639e-06,
+      "clip_ratio/low_mean": 2.634599570683349e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7573652346291055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15217.0,
+      "completions/mean_length": 6873.6875,
+      "completions/mean_terminated_length": 6645.4404296875,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 1.0255412608385086,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002320924773812294,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 37604865.0,
+      "reward": 0.234375,
+      "reward_std": 0.3135228157043457,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999098777770996,
+      "sampling/importance_sampling_ratio/min": 0.026153141632676125,
+      "sampling/sampling_logp_difference/max": 3.6437859535217285,
+      "sampling/sampling_logp_difference/mean": 0.019532475620508194,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.6350510122720152e-05,
+      "clip_ratio/high_mean": 4.087627530680038e-06,
+      "clip_ratio/low_mean": 2.351988746340794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7607515221461654e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15668.0,
+      "completions/mean_length": 6073.8984375,
+      "completions/mean_terminated_length": 5992.71630859375,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 1.0713753998279572,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002212709980085492,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 38405196.0,
+      "reward": 0.359375,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998978972434998,
+      "sampling/importance_sampling_ratio/min": 8.706459084351081e-06,
+      "sampling/sampling_logp_difference/max": 11.651445388793945,
+      "sampling/sampling_logp_difference/mean": 0.021252838894724846,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.729486718384578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.729486718384578e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15299.0,
+      "completions/mean_length": 5838.71875,
+      "completions/mean_terminated_length": 5671.33349609375,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "entropy": 1.021155133843422,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001135052996687591,
+      "learning_rate": 1e-05,
+      "loss": 0.0178,
+      "num_tokens": 39171704.0,
+      "reward": 0.28125,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.003084881929680705,
+      "sampling/sampling_logp_difference/max": 5.7812418937683105,
+      "sampling/sampling_logp_difference/mean": 0.020781882107257843,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 1.7124169744420215e-05,
+      "clip_ratio/high_mean": 4.281042436105054e-06,
+      "clip_ratio/low_mean": 3.706903294187214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.135007543482061e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14617.0,
+      "completions/max_terminated_length": 14617.0,
+      "completions/mean_length": 6358.5859375,
+      "completions/mean_terminated_length": 6358.5859375,
+      "completions/min_length": 940.0,
+      "completions/min_terminated_length": 940.0,
+      "entropy": 0.9720487147569656,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002638082252815366,
+      "learning_rate": 1e-05,
+      "loss": 0.0145,
+      "num_tokens": 40003859.0,
+      "reward": 0.40625,
+      "reward_std": 0.3174618184566498,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000380277633667,
+      "sampling/importance_sampling_ratio/min": 0.01960253342986107,
+      "sampling/sampling_logp_difference/max": 3.932096481323242,
+      "sampling/sampling_logp_difference/mean": 0.01991666667163372,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 6.55582925901399e-06,
+      "clip_ratio/high_mean": 2.994117721755174e-06,
+      "clip_ratio/low_mean": 2.222621503733535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5220332759090525e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14753.0,
+      "completions/max_terminated_length": 14753.0,
+      "completions/mean_length": 4634.1875,
+      "completions/mean_terminated_length": 4634.1875,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9715309366583824,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001994960242882371,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 40616483.0,
+      "reward": 0.4375,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000698566436768,
+      "sampling/importance_sampling_ratio/min": 1.0510009815334342e-05,
+      "sampling/sampling_logp_difference/max": 11.46318244934082,
+      "sampling/sampling_logp_difference/mean": 0.01902047172188759,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 2.2474248908110894e-05,
+      "clip_ratio/high_mean": 7.571314540655294e-06,
+      "clip_ratio/low_mean": 4.3583780325207044e-05,
+      "clip_ratio/low_min": 4.6013396968191955e-06,
+      "clip_ratio/region_mean": 5.1155094070054474e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 6596.25,
+      "completions/mean_terminated_length": 6361.34423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 0.8207943215966225,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019902780186384916,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 41484443.0,
+      "reward": 0.4453125,
+      "reward_std": 0.326668381690979,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000016689300537,
+      "sampling/importance_sampling_ratio/min": 7.485233072657138e-05,
+      "sampling/sampling_logp_difference/max": 9.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.018301833420991898,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 3.0019932637515012e-06,
+      "clip_ratio/high_mean": 7.504983159378753e-07,
+      "clip_ratio/low_mean": 4.332785601945943e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.407835376696312e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6785.75,
+      "completions/mean_terminated_length": 6313.70458984375,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "entropy": 0.9876058474183083,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015235114842653275,
+      "learning_rate": 1e-05,
+      "loss": 0.0128,
+      "num_tokens": 42372235.0,
+      "reward": 0.2421875,
+      "reward_std": 0.325075626373291,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999551773071289,
+      "sampling/importance_sampling_ratio/min": 0.026679370552301407,
+      "sampling/sampling_logp_difference/max": 3.6238646507263184,
+      "sampling/sampling_logp_difference/mean": 0.019945615902543068,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1349006601667497e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1349006601667497e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 4881.2109375,
+      "completions/mean_terminated_length": 4510.1533203125,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.989942155778408,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002033712575212121,
+      "learning_rate": 1e-05,
+      "loss": 0.1088,
+      "num_tokens": 43015238.0,
+      "reward": 0.4375,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000300407409668,
+      "sampling/importance_sampling_ratio/min": 0.0001238943514181301,
+      "sampling/sampling_logp_difference/max": 8.996081352233887,
+      "sampling/sampling_logp_difference/mean": 0.01887543685734272,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 2.584004687378183e-05,
+      "clip_ratio/high_mean": 6.4600117184454575e-06,
+      "clip_ratio/low_mean": 2.1371045761497953e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7831058105221018e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15001.0,
+      "completions/max_terminated_length": 15001.0,
+      "completions/mean_length": 4725.3984375,
+      "completions/mean_terminated_length": 4725.3984375,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 1.0350637435913086,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030296226032078266,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 43637737.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999939203262329,
+      "sampling/importance_sampling_ratio/min": 0.00022932067804504186,
+      "sampling/sampling_logp_difference/max": 8.380389213562012,
+      "sampling/sampling_logp_difference/mean": 0.01995944231748581,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 1.994733975152485e-05,
+      "clip_ratio/high_mean": 4.986834937881213e-06,
+      "clip_ratio/low_mean": 3.5168303838872816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.015513832200668e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16240.0,
+      "completions/mean_length": 4918.171875,
+      "completions/mean_terminated_length": 4736.1748046875,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.965274304151535,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002758471528068185,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 44285327.0,
+      "reward": 0.328125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999663233757019,
+      "sampling/importance_sampling_ratio/min": 0.010958661325275898,
+      "sampling/sampling_logp_difference/max": 4.513625144958496,
+      "sampling/sampling_logp_difference/mean": 0.019083233550190926,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 1.0621563887980301e-05,
+      "clip_ratio/high_mean": 2.6553909719950752e-06,
+      "clip_ratio/low_mean": 3.838553107016196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1040922042157035e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15031.0,
+      "completions/mean_length": 4998.2890625,
+      "completions/mean_terminated_length": 4908.6376953125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9200445115566254,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027611786499619484,
+      "learning_rate": 1e-05,
+      "loss": 0.0575,
+      "num_tokens": 44944356.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3895368278026581,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884366989136,
+      "sampling/importance_sampling_ratio/min": 0.0018651526188477874,
+      "sampling/sampling_logp_difference/max": 6.284412384033203,
+      "sampling/sampling_logp_difference/mean": 0.017853498458862305,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.0136624496226432e-05,
+      "clip_ratio/high_mean": 2.534156124056608e-06,
+      "clip_ratio/low_mean": 2.0260404085092887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2794560095462657e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6290.1796875,
+      "completions/mean_terminated_length": 6129.96044921875,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.9360214695334435,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015557854203507304,
+      "learning_rate": 1e-05,
+      "loss": 0.0111,
+      "num_tokens": 45767867.0,
+      "reward": 0.34375,
+      "reward_std": 0.30168038606643677,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999427795410156,
+      "sampling/importance_sampling_ratio/min": 0.0011004531988874078,
+      "sampling/sampling_logp_difference/max": 6.812033176422119,
+      "sampling/sampling_logp_difference/mean": 0.0200855303555727,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 2.2559511307918e-06,
+      "clip_ratio/high_mean": 5.6398778269795e-07,
+      "clip_ratio/low_mean": 4.51761221711422e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.574010984015331e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16366.0,
+      "completions/mean_length": 6486.15625,
+      "completions/mean_terminated_length": 6248.6083984375,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "entropy": 0.863138921558857,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026953541673719883,
+      "learning_rate": 1e-05,
+      "loss": -0.0194,
+      "num_tokens": 46618575.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999406337738037,
+      "sampling/importance_sampling_ratio/min": 0.0011708897072821856,
+      "sampling/sampling_logp_difference/max": 6.749991416931152,
+      "sampling/sampling_logp_difference/mean": 0.01863238587975502,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.0073357771034352e-05,
+      "clip_ratio/high_mean": 2.518339442758588e-06,
+      "clip_ratio/low_mean": 2.787370635815023e-05,
+      "clip_ratio/low_min": 3.837534222839167e-06,
+      "clip_ratio/region_mean": 3.0392045573535142e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 6442.7734375,
+      "completions/mean_terminated_length": 6284.9765625,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0242054909467697,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024442619178444147,
+      "learning_rate": 1e-05,
+      "loss": 0.0569,
+      "num_tokens": 47462274.0,
+      "reward": 0.328125,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998892545700073,
+      "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09,
+      "sampling/sampling_logp_difference/max": 19.124980926513672,
+      "sampling/sampling_logp_difference/mean": 0.019810764119029045,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 1.220810372615233e-05,
+      "clip_ratio/high_mean": 3.0520259315380827e-06,
+      "clip_ratio/low_mean": 4.339240456374682e-05,
+      "clip_ratio/low_min": 4.491233084991109e-06,
+      "clip_ratio/region_mean": 4.644443038159807e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 4807.765625,
+      "completions/mean_terminated_length": 4716.6142578125,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "entropy": 1.045751042664051,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002512057079002261,
+      "learning_rate": 1e-05,
+      "loss": 0.003,
+      "num_tokens": 48096692.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999058842658997,
+      "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05,
+      "sampling/sampling_logp_difference/max": 11.374892234802246,
+      "sampling/sampling_logp_difference/mean": 0.01960371434688568,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 5.37941218681226e-06,
+      "clip_ratio/high_mean": 1.344853046703065e-06,
+      "clip_ratio/low_mean": 3.0161771633174794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1506624850408116e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 6703.8359375,
+      "completions/mean_terminated_length": 6471.51220703125,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 1.0592866837978363,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016389708034694195,
+      "learning_rate": 1e-05,
+      "loss": -0.024,
+      "num_tokens": 48974399.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2585548758506775,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999353885650635,
+      "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06,
+      "sampling/sampling_logp_difference/max": 11.8125,
+      "sampling/sampling_logp_difference/mean": 0.020880095660686493,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 7.093600515872822e-06,
+      "clip_ratio/high_mean": 1.7734001289682055e-06,
+      "clip_ratio/low_mean": 4.470584758564655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.647924811251869e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16295.0,
+      "completions/mean_length": 6140.5078125,
+      "completions/mean_terminated_length": 5724.10546875,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 1.0998501181602478,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003946912474930286,
+      "learning_rate": 1e-05,
+      "loss": 0.0448,
+      "num_tokens": 49779920.0,
+      "reward": 0.34375,
+      "reward_std": 0.36796674132347107,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 2.849436668839189e-07,
+      "sampling/sampling_logp_difference/max": 15.070974349975586,
+      "sampling/sampling_logp_difference/mean": 0.021355850622057915,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.313956779038563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.313956779038563e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 6689.8046875,
+      "completions/mean_terminated_length": 6213.04052734375,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.8561654165387154,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021656695753335953,
+      "learning_rate": 1e-05,
+      "loss": 0.0283,
+      "num_tokens": 50655023.0,
+      "reward": 0.203125,
+      "reward_std": 0.21723884344100952,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999941885471344,
+      "sampling/importance_sampling_ratio/min": 2.836359499269747e-06,
+      "sampling/sampling_logp_difference/max": 12.772989273071289,
+      "sampling/sampling_logp_difference/mean": 0.01873670145869255,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 2.3421607693308033e-05,
+      "clip_ratio/high_mean": 7.242933975248889e-06,
+      "clip_ratio/low_mean": 3.896083626386826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.620377103492501e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14330.0,
+      "completions/max_terminated_length": 14330.0,
+      "completions/mean_length": 5707.0078125,
+      "completions/mean_terminated_length": 5707.0078125,
+      "completions/min_length": 625.0,
+      "completions/min_terminated_length": 625.0,
+      "entropy": 1.1396166533231735,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004121148493140936,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 51406536.0,
+      "reward": 0.3125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999328851699829,
+      "sampling/importance_sampling_ratio/min": 0.0005196487763896585,
+      "sampling/sampling_logp_difference/max": 7.562357425689697,
+      "sampling/sampling_logp_difference/mean": 0.020000409334897995,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 1.82290532393381e-05,
+      "clip_ratio/high_mean": 4.557263309834525e-06,
+      "clip_ratio/low_mean": 2.5275351731579576e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9832615496161452e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5655.6328125,
+      "completions/mean_terminated_length": 5571.1572265625,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "entropy": 0.8928132206201553,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032538517843931913,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 52148473.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29432642459869385,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000033378601074,
+      "sampling/importance_sampling_ratio/min": 0.0017573959194123745,
+      "sampling/sampling_logp_difference/max": 6.343922138214111,
+      "sampling/sampling_logp_difference/mean": 0.018881790339946747,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.2836022506235167e-05,
+      "clip_ratio/high_mean": 3.209005626558792e-06,
+      "clip_ratio/low_mean": 3.8109637216621195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.131864307055366e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7399.7890625,
+      "completions/mean_terminated_length": 7034.5771484375,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "entropy": 0.8808257132768631,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002061733277514577,
+      "learning_rate": 1e-05,
+      "loss": 0.0191,
+      "num_tokens": 53113230.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673962593079,
+      "sampling/importance_sampling_ratio/min": 0.005283349193632603,
+      "sampling/sampling_logp_difference/max": 5.243195056915283,
+      "sampling/sampling_logp_difference/mean": 0.018456293269991875,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 1.5806871488166507e-05,
+      "clip_ratio/high_mean": 4.739466817227367e-06,
+      "clip_ratio/low_mean": 3.610486896832299e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.084433521711617e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16208.0,
+      "completions/mean_length": 5730.9609375,
+      "completions/mean_terminated_length": 5475.2880859375,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9486126750707626,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012298432411625981,
+      "learning_rate": 1e-05,
+      "loss": 0.0208,
+      "num_tokens": 53864049.0,
+      "reward": 0.359375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999348521232605,
+      "sampling/importance_sampling_ratio/min": 4.832820559386164e-05,
+      "sampling/sampling_logp_difference/max": 9.937495231628418,
+      "sampling/sampling_logp_difference/mean": 0.01919996738433838,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.2390134997986024e-05,
+      "clip_ratio/high_mean": 3.097533749496506e-06,
+      "clip_ratio/low_mean": 3.8867822581778455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.19653564449618e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13500.0,
+      "completions/mean_length": 4620.5703125,
+      "completions/mean_terminated_length": 4527.94482421875,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9557560831308365,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002882040338590741,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 54473498.0,
+      "reward": 0.3984375,
+      "reward_std": 0.39294686913490295,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998915195465088,
+      "sampling/importance_sampling_ratio/min": 1.577107298089686e-07,
+      "sampling/sampling_logp_difference/max": 15.662503242492676,
+      "sampling/sampling_logp_difference/mean": 0.018525000661611557,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.088819471486204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.088819471486204e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16314.0,
+      "completions/max_terminated_length": 16314.0,
+      "completions/mean_length": 5074.0703125,
+      "completions/mean_terminated_length": 5074.0703125,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8830869868397713,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003324020653963089,
+      "learning_rate": 1e-05,
+      "loss": 0.0305,
+      "num_tokens": 55141787.0,
+      "reward": 0.4609375,
+      "reward_std": 0.30115634202957153,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999203681945801,
+      "sampling/importance_sampling_ratio/min": 0.0009876838885247707,
+      "sampling/sampling_logp_difference/max": 6.920147895812988,
+      "sampling/sampling_logp_difference/mean": 0.018072880804538727,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.526649884908693e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.526649884908693e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15251.0,
+      "completions/max_terminated_length": 15251.0,
+      "completions/mean_length": 6192.1015625,
+      "completions/mean_terminated_length": 6192.1015625,
+      "completions/min_length": 553.0,
+      "completions/min_terminated_length": 553.0,
+      "entropy": 1.0888547226786613,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017452294705435634,
+      "learning_rate": 1e-05,
+      "loss": 0.0216,
+      "num_tokens": 55954144.0,
+      "reward": 0.2890625,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473690986633,
+      "sampling/importance_sampling_ratio/min": 5.061922365712235e-07,
+      "sampling/sampling_logp_difference/max": 14.496349334716797,
+      "sampling/sampling_logp_difference/mean": 0.021221645176410675,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.6768677141953958e-05,
+      "clip_ratio/high_mean": 5.080836899651331e-06,
+      "clip_ratio/low_mean": 3.340929970363504e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.84901372854074e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6204.296875,
+      "completions/mean_terminated_length": 6124.1416015625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 1.0423575639724731,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0033357341308146715,
+      "learning_rate": 1e-05,
+      "loss": 0.1073,
+      "num_tokens": 56765470.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37875816226005554,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99998539686203,
+      "sampling/importance_sampling_ratio/min": 4.564182381727733e-05,
+      "sampling/sampling_logp_difference/max": 9.994686126708984,
+      "sampling/sampling_logp_difference/mean": 0.01908688060939312,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 3.149884150843718e-06,
+      "clip_ratio/high_mean": 7.874710377109295e-07,
+      "clip_ratio/low_mean": 2.430614893000893e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.509361991087644e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14409.0,
+      "completions/max_terminated_length": 14409.0,
+      "completions/mean_length": 5070.3125,
+      "completions/mean_terminated_length": 5070.3125,
+      "completions/min_length": 629.0,
+      "completions/min_terminated_length": 629.0,
+      "entropy": 1.0737399458885193,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038695367984473705,
+      "learning_rate": 1e-05,
+      "loss": 0.0015,
+      "num_tokens": 57432958.0,
+      "reward": 0.390625,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223947525024,
+      "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06,
+      "sampling/sampling_logp_difference/max": 13.376652717590332,
+      "sampling/sampling_logp_difference/mean": 0.01970684342086315,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 1.9821940441033803e-05,
+      "clip_ratio/high_mean": 4.955485110258451e-06,
+      "clip_ratio/low_mean": 2.9055729555693688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.401121466595214e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15799.0,
+      "completions/mean_length": 5750.21875,
+      "completions/mean_terminated_length": 5495.00830078125,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "entropy": 0.9708107560873032,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002927646040916443,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 58187426.0,
+      "reward": 0.296875,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999390840530396,
+      "sampling/importance_sampling_ratio/min": 0.015204614959657192,
+      "sampling/sampling_logp_difference/max": 4.186156272888184,
+      "sampling/sampling_logp_difference/mean": 0.019483914598822594,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.3815636723156786e-05,
+      "clip_ratio/high_mean": 5.953909180789196e-06,
+      "clip_ratio/low_mean": 4.989707144886779e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.585097960647545e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15938.0,
+      "completions/mean_length": 6067.484375,
+      "completions/mean_terminated_length": 5986.251953125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9576351121068001,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0026169484481215477,
+      "learning_rate": 1e-05,
+      "loss": -0.0055,
+      "num_tokens": 58983336.0,
+      "reward": 0.390625,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999620914459229,
+      "sampling/importance_sampling_ratio/min": 1.974713995878119e-06,
+      "sampling/sampling_logp_difference/max": 13.135087013244629,
+      "sampling/sampling_logp_difference/mean": 0.019007554277777672,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 2.4238934656750644e-05,
+      "clip_ratio/high_mean": 7.786730066072778e-06,
+      "clip_ratio/low_mean": 4.5700241571466904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3486972547034384e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13640.0,
+      "completions/max_terminated_length": 13640.0,
+      "completions/mean_length": 4612.8984375,
+      "completions/mean_terminated_length": 4612.8984375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "entropy": 0.9636320173740387,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015429699560627341,
+      "learning_rate": 1e-05,
+      "loss": -0.018,
+      "num_tokens": 59590763.0,
+      "reward": 0.421875,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08,
+      "sampling/sampling_logp_difference/max": 17.468652725219727,
+      "sampling/sampling_logp_difference/mean": 0.019313856959342957,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0911465842109465e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0911465842109465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 6101.3125,
+      "completions/mean_terminated_length": 5854.5283203125,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "entropy": 0.8831139355897903,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022505265660583973,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 60391283.0,
+      "reward": 0.3125,
+      "reward_std": 0.29302334785461426,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 0.0003816343960352242,
+      "sampling/sampling_logp_difference/max": 7.871047496795654,
+      "sampling/sampling_logp_difference/mean": 0.018377842381596565,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 1.547606643725885e-05,
+      "clip_ratio/high_mean": 3.869016609314713e-06,
+      "clip_ratio/low_mean": 2.478705800967873e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8656074391619768e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14862.0,
+      "completions/mean_length": 4705.9921875,
+      "completions/mean_terminated_length": 4614.03955078125,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "entropy": 0.9557913094758987,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002069958718493581,
+      "learning_rate": 1e-05,
+      "loss": -0.0015,
+      "num_tokens": 61021490.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2637920379638672,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030232429504,
+      "sampling/importance_sampling_ratio/min": 2.76673017651774e-05,
+      "sampling/sampling_logp_difference/max": 10.495259284973145,
+      "sampling/sampling_logp_difference/mean": 0.018629569560289383,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 2.0910484636260662e-05,
+      "clip_ratio/high_mean": 5.2276211590651656e-06,
+      "clip_ratio/low_mean": 1.952954164607945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4757162805144617e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13745.0,
+      "completions/max_terminated_length": 13745.0,
+      "completions/mean_length": 5116.78125,
+      "completions/mean_terminated_length": 5116.78125,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "entropy": 1.0198405236005783,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034461067989468575,
+      "learning_rate": 1e-05,
+      "loss": -0.0073,
+      "num_tokens": 61695382.0,
+      "reward": 0.265625,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999936819076538,
+      "sampling/importance_sampling_ratio/min": 0.012227212078869343,
+      "sampling/sampling_logp_difference/max": 4.4040913581848145,
+      "sampling/sampling_logp_difference/mean": 0.019400250166654587,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.5340228401328204e-05,
+      "clip_ratio/high_mean": 3.835057100332051e-06,
+      "clip_ratio/low_mean": 3.150914017169271e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.534419727202476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 5891.9140625,
+      "completions/mean_terminated_length": 5553.45947265625,
+      "completions/min_length": 79.0,
+      "completions/min_terminated_length": 79.0,
+      "entropy": 0.9568078517913818,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025854657869786024,
+      "learning_rate": 1e-05,
+      "loss": 0.1013,
+      "num_tokens": 62474883.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001013278961182,
+      "sampling/importance_sampling_ratio/min": 0.0015072470996528864,
+      "sampling/sampling_logp_difference/max": 6.497470378875732,
+      "sampling/sampling_logp_difference/mean": 0.019574139267206192,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 1.108303422370227e-05,
+      "clip_ratio/high_mean": 2.7707585559255676e-06,
+      "clip_ratio/low_mean": 2.2325777763398946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5096536319324514e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13671.0,
+      "completions/mean_length": 5300.3359375,
+      "completions/mean_terminated_length": 5213.06298828125,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "entropy": 0.9722280204296112,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025075653102248907,
+      "learning_rate": 1e-05,
+      "loss": 0.0312,
+      "num_tokens": 63172454.0,
+      "reward": 0.203125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 0.00020346972451079637,
+      "sampling/sampling_logp_difference/max": 8.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02002432942390442,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 1.3991947980684927e-05,
+      "clip_ratio/high_mean": 3.4979869951712317e-06,
+      "clip_ratio/low_mean": 4.893367201930232e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.243165958290774e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15617.0,
+      "completions/mean_length": 6364.21875,
+      "completions/mean_terminated_length": 6205.1748046875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "entropy": 1.0607495978474617,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017982006538659334,
+      "learning_rate": 1e-05,
+      "loss": -0.0117,
+      "num_tokens": 64007602.0,
+      "reward": 0.2890625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 3.823801307589747e-05,
+      "sampling/sampling_logp_difference/max": 10.171680450439453,
+      "sampling/sampling_logp_difference/mean": 0.020373597741127014,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6416430046083406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6416430046083406e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14709.0,
+      "completions/mean_length": 5746.3125,
+      "completions/mean_terminated_length": 5403.1611328125,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "entropy": 0.9913106113672256,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002207317156717181,
+      "learning_rate": 1e-05,
+      "loss": 0.063,
+      "num_tokens": 64762058.0,
+      "reward": 0.34375,
+      "reward_std": 0.3264310359954834,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999239444732666,
+      "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08,
+      "sampling/sampling_logp_difference/max": 16.744617462158203,
+      "sampling/sampling_logp_difference/mean": 0.020608089864253998,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 1.2681661701208213e-05,
+      "clip_ratio/high_mean": 3.1704154253020533e-06,
+      "clip_ratio/low_mean": 3.541917828897567e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.85895939416514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6088.5625,
+      "completions/mean_terminated_length": 5841.47216796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.9040444120764732,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012974507408216596,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 65561002.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998487234115601,
+      "sampling/importance_sampling_ratio/min": 6.021501121722395e-06,
+      "sampling/sampling_logp_difference/max": 12.020174026489258,
+      "sampling/sampling_logp_difference/mean": 0.01939838007092476,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 7.807132533343975e-06,
+      "clip_ratio/high_mean": 1.9517831333359936e-06,
+      "clip_ratio/low_mean": 1.8564539345788944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.05163223654381e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15021.0,
+      "completions/mean_length": 5765.5,
+      "completions/mean_terminated_length": 5510.65625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 0.9966336265206337,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0013380619930103421,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 66318482.0,
+      "reward": 0.375,
+      "reward_std": 0.13994136452674866,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999471306800842,
+      "sampling/importance_sampling_ratio/min": 7.288413598871557e-06,
+      "sampling/sampling_logp_difference/max": 11.829224586486816,
+      "sampling/sampling_logp_difference/mean": 0.018109245225787163,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 1.7906912489706883e-05,
+      "clip_ratio/high_mean": 4.476728122426721e-06,
+      "clip_ratio/low_mean": 2.5812531305291486e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0289259655091882e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16120.0,
+      "completions/mean_length": 5462.78125,
+      "completions/mean_terminated_length": 5200.67236328125,
+      "completions/min_length": 460.0,
+      "completions/min_terminated_length": 460.0,
+      "entropy": 0.9345141425728798,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023930128663778305,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 67038582.0,
+      "reward": 0.46875,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513030052185,
+      "sampling/importance_sampling_ratio/min": 0.008508839644491673,
+      "sampling/sampling_logp_difference/max": 4.7666497230529785,
+      "sampling/sampling_logp_difference/mean": 0.019220296293497086,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.551389118503721e-05,
+      "clip_ratio/high_mean": 3.878472796259302e-06,
+      "clip_ratio/low_mean": 3.239646628117043e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6274939645863924e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15034.0,
+      "completions/max_terminated_length": 15034.0,
+      "completions/mean_length": 5547.5078125,
+      "completions/mean_terminated_length": 5547.5078125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0511749312281609,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0013633714988827705,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 67774487.0,
+      "reward": 0.203125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05,
+      "sampling/sampling_logp_difference/max": 11.418023109436035,
+      "sampling/sampling_logp_difference/mean": 0.020328814163804054,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.5384989410449634e-05,
+      "clip_ratio/high_mean": 3.846247352612409e-06,
+      "clip_ratio/low_mean": 3.441604167164769e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.826228908110352e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5835.4140625,
+      "completions/mean_terminated_length": 5406.609375,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 1.0024723336100578,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0036165034398436546,
+      "learning_rate": 1e-05,
+      "loss": 0.0373,
+      "num_tokens": 68541660.0,
+      "reward": 0.34375,
+      "reward_std": 0.3584783673286438,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 9.518130354990717e-06,
+      "sampling/sampling_logp_difference/max": 11.562312126159668,
+      "sampling/sampling_logp_difference/mean": 0.020469525828957558,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 6.105602551542688e-06,
+      "clip_ratio/high_mean": 1.526400637885672e-06,
+      "clip_ratio/low_mean": 5.3129634352444555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.46560352177039e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15695.0,
+      "completions/mean_length": 6252.609375,
+      "completions/mean_terminated_length": 6172.83447265625,
+      "completions/min_length": 481.0,
+      "completions/min_terminated_length": 481.0,
+      "entropy": 1.0325519517064095,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022011541295796633,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 69365418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.32301604747772217,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998809099197388,
+      "sampling/importance_sampling_ratio/min": 0.0005531083443202078,
+      "sampling/sampling_logp_difference/max": 7.4999566078186035,
+      "sampling/sampling_logp_difference/mean": 0.02079072594642639,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 4.348128641140647e-06,
+      "clip_ratio/high_mean": 1.0870321602851618e-06,
+      "clip_ratio/low_mean": 3.0097819148977578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.118485085451539e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15316.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 5581.484375,
+      "completions/mean_terminated_length": 5581.484375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.9222500994801521,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002300912281498313,
+      "learning_rate": 1e-05,
+      "loss": -0.0007,
+      "num_tokens": 70099320.0,
+      "reward": 0.296875,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998577833175659,
+      "sampling/importance_sampling_ratio/min": 8.140386853483506e-08,
+      "sampling/sampling_logp_difference/max": 16.323843002319336,
+      "sampling/sampling_logp_difference/mean": 0.01952272653579712,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.5122252029395895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5122252029395895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15781.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5424.140625,
+      "completions/mean_terminated_length": 5424.140625,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "entropy": 1.0446564108133316,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016312639927491546,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 70811474.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000094175338745,
+      "sampling/importance_sampling_ratio/min": 0.0021919538266956806,
+      "sampling/sampling_logp_difference/max": 6.12296199798584,
+      "sampling/sampling_logp_difference/mean": 0.019741754978895187,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.0354576261306647e-05,
+      "clip_ratio/high_mean": 3.496124691082514e-06,
+      "clip_ratio/low_mean": 4.096481598026003e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.446094089871622e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 5884.9609375,
+      "completions/mean_terminated_length": 5884.9609375,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9605691060423851,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032865386456251144,
+      "learning_rate": 1e-05,
+      "loss": 0.0451,
+      "num_tokens": 71582701.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3514111638069153,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999833106994629,
+      "sampling/importance_sampling_ratio/min": 1.149311810877407e-05,
+      "sampling/sampling_logp_difference/max": 11.373762130737305,
+      "sampling/sampling_logp_difference/mean": 0.019438734278082848,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 1.026998006636859e-05,
+      "clip_ratio/high_mean": 2.5674950165921473e-06,
+      "clip_ratio/low_mean": 3.5440503552308655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8007998455213965e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15361.0,
+      "completions/max_terminated_length": 15361.0,
+      "completions/mean_length": 4835.09375,
+      "completions/mean_terminated_length": 4835.09375,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.9038172215223312,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004721678793430328,
+      "learning_rate": 1e-05,
+      "loss": 0.1143,
+      "num_tokens": 72220025.0,
+      "reward": 0.4765625,
+      "reward_std": 0.38481879234313965,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99994957447052,
+      "sampling/importance_sampling_ratio/min": 2.710051205667696e-07,
+      "sampling/sampling_logp_difference/max": 15.12112808227539,
+      "sampling/sampling_logp_difference/mean": 0.017888439819216728,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 2.93432283342554e-05,
+      "clip_ratio/high_mean": 9.56252398509605e-06,
+      "clip_ratio/low_mean": 4.7865792453194445e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.742831808674964e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14431.0,
+      "completions/mean_length": 5979.078125,
+      "completions/mean_terminated_length": 5897.1494140625,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "entropy": 1.0227951630949974,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0010532280430197716,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 73005515.0,
+      "reward": 0.2890625,
+      "reward_std": 0.30115631222724915,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999090433120728,
+      "sampling/importance_sampling_ratio/min": 0.00030157779110595584,
+      "sampling/sampling_logp_difference/max": 8.10648250579834,
+      "sampling/sampling_logp_difference/mean": 0.019633149728178978,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 4.203234766464448e-06,
+      "clip_ratio/high_mean": 1.050808691616112e-06,
+      "clip_ratio/low_mean": 2.5574990331733716e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6625799137036665e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15886.0,
+      "completions/max_terminated_length": 15886.0,
+      "completions/mean_length": 4292.1796875,
+      "completions/mean_terminated_length": 4292.1796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.8719984591007233,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038324075285345316,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 73572794.0,
+      "reward": 0.4375,
+      "reward_std": 0.2972046136856079,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999188780784607,
+      "sampling/importance_sampling_ratio/min": 0.015675775706768036,
+      "sampling/sampling_logp_difference/max": 4.155638694763184,
+      "sampling/sampling_logp_difference/mean": 0.018074234947562218,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 4.431366960488958e-06,
+      "clip_ratio/high_mean": 1.1078417401222396e-06,
+      "clip_ratio/low_mean": 4.433405501913512e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.54418968729442e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14674.0,
+      "completions/max_terminated_length": 14674.0,
+      "completions/mean_length": 5449.2890625,
+      "completions/mean_terminated_length": 5449.2890625,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9137986451387405,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004843447357416153,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 74289607.0,
+      "reward": 0.5,
+      "reward_std": 0.40609243512153625,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 8.851584993863071e-07,
+      "sampling/sampling_logp_difference/max": 13.937499046325684,
+      "sampling/sampling_logp_difference/mean": 0.018183842301368713,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 8.212076863856055e-06,
+      "clip_ratio/high_mean": 2.0530192159640137e-06,
+      "clip_ratio/low_mean": 3.6279372466196946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.833239122741361e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16163.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 4983.3515625,
+      "completions/mean_terminated_length": 4983.3515625,
+      "completions/min_length": 541.0,
+      "completions/min_terminated_length": 541.0,
+      "entropy": 0.9354705810546875,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037651765160262585,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 74946484.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519309043884,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549984931946,
+      "sampling/importance_sampling_ratio/min": 0.00011593531962716952,
+      "sampling/sampling_logp_difference/max": 9.062478065490723,
+      "sampling/sampling_logp_difference/mean": 0.018207306042313576,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.3182888324081432e-05,
+      "clip_ratio/high_mean": 3.295722081020358e-06,
+      "clip_ratio/low_mean": 2.544108633628639e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8736808644680423e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16039.0,
+      "completions/mean_length": 6351.1015625,
+      "completions/mean_terminated_length": 6027.45947265625,
+      "completions/min_length": 804.0,
+      "completions/min_terminated_length": 804.0,
+      "entropy": 0.9310042560100555,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0009160125628113747,
+      "learning_rate": 1e-05,
+      "loss": -0.023,
+      "num_tokens": 75779145.0,
+      "reward": 0.3828125,
+      "reward_std": 0.24329257011413574,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998877048492432,
+      "sampling/importance_sampling_ratio/min": 0.0002961359277833253,
+      "sampling/sampling_logp_difference/max": 8.1246919631958,
+      "sampling/sampling_logp_difference/mean": 0.018513178452849388,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.1402620202716207e-05,
+      "clip_ratio/high_mean": 3.935649147024378e-06,
+      "clip_ratio/low_mean": 3.059757568735222e-05,
+      "clip_ratio/low_min": 4.3258582991256844e-06,
+      "clip_ratio/region_mean": 3.45332257438713e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14471.0,
+      "completions/mean_length": 5293.40625,
+      "completions/mean_terminated_length": 4935.64501953125,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "entropy": 1.0732879787683487,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023993055801838636,
+      "learning_rate": 1e-05,
+      "loss": 0.1021,
+      "num_tokens": 76475557.0,
+      "reward": 0.34375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000077724456787,
+      "sampling/importance_sampling_ratio/min": 6.613240111619234e-05,
+      "sampling/sampling_logp_difference/max": 9.623851776123047,
+      "sampling/sampling_logp_difference/mean": 0.020792219787836075,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 2.130644793396641e-05,
+      "clip_ratio/high_mean": 8.929533635182452e-06,
+      "clip_ratio/low_mean": 2.663600798769039e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.556554071337814e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 7619.7578125,
+      "completions/mean_terminated_length": 7409.41650390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9646238535642624,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014872358879074454,
+      "learning_rate": 1e-05,
+      "loss": 0.0439,
+      "num_tokens": 77474310.0,
+      "reward": 0.34375,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999638795852661,
+      "sampling/importance_sampling_ratio/min": 0.0016686831368133426,
+      "sampling/sampling_logp_difference/max": 6.395720481872559,
+      "sampling/sampling_logp_difference/mean": 0.020074717700481415,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 1.7765815300663235e-05,
+      "clip_ratio/high_mean": 5.154013138053415e-06,
+      "clip_ratio/low_mean": 5.166909659237717e-05,
+      "clip_ratio/low_min": 8.365680514543783e-06,
+      "clip_ratio/region_mean": 5.68231100714911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15984.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 5959.921875,
+      "completions/mean_terminated_length": 5959.921875,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.004471093416214,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00398358516395092,
+      "learning_rate": 1e-05,
+      "loss": 0.1016,
+      "num_tokens": 78257132.0,
+      "reward": 0.359375,
+      "reward_std": 0.3653082847595215,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000170469284058,
+      "sampling/importance_sampling_ratio/min": 0.0030075267422944307,
+      "sampling/sampling_logp_difference/max": 5.806637287139893,
+      "sampling/sampling_logp_difference/mean": 0.020755283534526825,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6946955838648137e-05,
+      "clip_ratio/high_mean": 4.236738959662034e-06,
+      "clip_ratio/low_mean": 4.510891039899434e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.934564867653535e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13736.0,
+      "completions/mean_length": 5427.03125,
+      "completions/mean_terminated_length": 5340.755859375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9117375314235687,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0019883522763848305,
+      "learning_rate": 1e-05,
+      "loss": 0.01,
+      "num_tokens": 78971072.0,
+      "reward": 0.375,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000550746917725,
+      "sampling/importance_sampling_ratio/min": 0.0008046010043472052,
+      "sampling/sampling_logp_difference/max": 7.125164031982422,
+      "sampling/sampling_logp_difference/mean": 0.018812140449881554,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 2.968176841022796e-05,
+      "clip_ratio/high_mean": 7.42044210255699e-06,
+      "clip_ratio/low_mean": 3.220799408154562e-05,
+      "clip_ratio/low_min": 5.315981979947537e-06,
+      "clip_ratio/region_mean": 3.962843629778945e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16293.0,
+      "completions/max_terminated_length": 16293.0,
+      "completions/mean_length": 6062.078125,
+      "completions/mean_terminated_length": 6062.078125,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 1.0164100378751755,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00450351694598794,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 79764434.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26355957984924316,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999713897705078,
+      "sampling/importance_sampling_ratio/min": 0.0007411236292682588,
+      "sampling/sampling_logp_difference/max": 7.207343101501465,
+      "sampling/sampling_logp_difference/mean": 0.020526543259620667,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.856050622947805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.856050622947805e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13689.0,
+      "completions/max_terminated_length": 13689.0,
+      "completions/mean_length": 4856.53125,
+      "completions/mean_terminated_length": 4856.53125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "entropy": 1.0780886858701706,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0033157530706375837,
+      "learning_rate": 1e-05,
+      "loss": 0.046,
+      "num_tokens": 80405238.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3487703502178192,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999889135360718,
+      "sampling/importance_sampling_ratio/min": 0.033773623406887054,
+      "sampling/sampling_logp_difference/max": 3.7256407737731934,
+      "sampling/sampling_logp_difference/mean": 0.019188418984413147,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.975351790406421e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.975351790406421e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16335.0,
+      "completions/max_terminated_length": 16335.0,
+      "completions/mean_length": 3930.5859375,
+      "completions/mean_terminated_length": 3930.5859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8666863515973091,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005471619311720133,
+      "learning_rate": 1e-05,
+      "loss": -0.0779,
+      "num_tokens": 80926721.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3164186179637909,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000040531158447,
+      "sampling/importance_sampling_ratio/min": 0.0002562212466727942,
+      "sampling/sampling_logp_difference/max": 8.269469261169434,
+      "sampling/sampling_logp_difference/mean": 0.017708823084831238,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 6.743997801095247e-06,
+      "clip_ratio/high_mean": 1.6859994502738118e-06,
+      "clip_ratio/low_mean": 3.61007656692891e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7786765119562915e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15546.0,
+      "completions/mean_length": 5934.9453125,
+      "completions/mean_terminated_length": 5684.16845703125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 0.9991667941212654,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002580739092081785,
+      "learning_rate": 1e-05,
+      "loss": -0.0065,
+      "num_tokens": 81707978.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24671243131160736,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000852346420288,
+      "sampling/importance_sampling_ratio/min": 0.002478762762621045,
+      "sampling/sampling_logp_difference/max": 5.999995708465576,
+      "sampling/sampling_logp_difference/mean": 0.019801246002316475,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.43532002741631e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.43532002741631e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 5866.84375,
+      "completions/mean_terminated_length": 5699.9052734375,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "entropy": 0.9848997294902802,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0010949905263260007,
+      "learning_rate": 1e-05,
+      "loss": 0.0266,
+      "num_tokens": 82477310.0,
+      "reward": 0.2734375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999667406082153,
+      "sampling/importance_sampling_ratio/min": 9.04304688447155e-05,
+      "sampling/sampling_logp_difference/max": 9.310929298400879,
+      "sampling/sampling_logp_difference/mean": 0.020769795402884483,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 1.9307613456476247e-05,
+      "clip_ratio/high_mean": 4.826903364119062e-06,
+      "clip_ratio/low_mean": 5.842190330440644e-05,
+      "clip_ratio/low_min": 1.2287753634154797e-05,
+      "clip_ratio/region_mean": 6.324880496322294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14501.0,
+      "completions/max_terminated_length": 14501.0,
+      "completions/mean_length": 6613.7578125,
+      "completions/mean_terminated_length": 6613.7578125,
+      "completions/min_length": 1033.0,
+      "completions/min_terminated_length": 1033.0,
+      "entropy": 0.9176012054085732,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020384234376251698,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 83345055.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999457001686096,
+      "sampling/importance_sampling_ratio/min": 0.029541675001382828,
+      "sampling/sampling_logp_difference/max": 3.5219533443450928,
+      "sampling/sampling_logp_difference/mean": 0.018883168697357178,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.382043183184578e-05,
+      "clip_ratio/high_mean": 3.455107957961445e-06,
+      "clip_ratio/low_mean": 5.789885449303256e-05,
+      "clip_ratio/low_min": 1.017130716718384e-05,
+      "clip_ratio/region_mean": 6.135396188255982e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 6392.3125,
+      "completions/mean_terminated_length": 6070.0,
+      "completions/min_length": 507.0,
+      "completions/min_terminated_length": 507.0,
+      "entropy": 0.904954232275486,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0031166900880634785,
+      "learning_rate": 1e-05,
+      "loss": 0.0351,
+      "num_tokens": 84186343.0,
+      "reward": 0.390625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999208450317383,
+      "sampling/importance_sampling_ratio/min": 0.00022529886336997151,
+      "sampling/sampling_logp_difference/max": 8.398082733154297,
+      "sampling/sampling_logp_difference/mean": 0.01931958645582199,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.7221671441802755e-05,
+      "clip_ratio/high_mean": 6.549099907715572e-06,
+      "clip_ratio/low_mean": 3.147818074467068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802728065238625e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5982.703125,
+      "completions/mean_terminated_length": 5817.603515625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 0.8394555225968361,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022041688207536936,
+      "learning_rate": 1e-05,
+      "loss": 0.1043,
+      "num_tokens": 84971129.0,
+      "reward": 0.3125,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030828475952,
+      "sampling/importance_sampling_ratio/min": 1.553593506287143e-06,
+      "sampling/sampling_logp_difference/max": 13.374939918518066,
+      "sampling/sampling_logp_difference/mean": 0.01795877143740654,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 2.9651660042873118e-05,
+      "clip_ratio/high_mean": 9.398806923854863e-06,
+      "clip_ratio/low_mean": 4.788733849636628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.728614519284747e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14988.0,
+      "completions/mean_length": 4976.921875,
+      "completions/mean_terminated_length": 4608.95166015625,
+      "completions/min_length": 335.0,
+      "completions/min_terminated_length": 335.0,
+      "entropy": 0.8381234556436539,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0037972736172378063,
+      "learning_rate": 1e-05,
+      "loss": 0.1244,
+      "num_tokens": 85625559.0,
+      "reward": 0.4765625,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970555305481,
+      "sampling/importance_sampling_ratio/min": 0.002990707289427519,
+      "sampling/sampling_logp_difference/max": 5.8122453689575195,
+      "sampling/sampling_logp_difference/mean": 0.01815030723810196,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 4.130592969886493e-06,
+      "clip_ratio/high_mean": 1.0326482424716232e-06,
+      "clip_ratio/low_mean": 1.6904315600640984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7936963843112608e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 6307.2421875,
+      "completions/mean_terminated_length": 6065.400390625,
+      "completions/min_length": 823.0,
+      "completions/min_terminated_length": 823.0,
+      "entropy": 1.1176434755325317,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0012413962977007031,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 86453606.0,
+      "reward": 0.28125,
+      "reward_std": 0.2280253767967224,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 0.004730688873678446,
+      "sampling/sampling_logp_difference/max": 5.353684425354004,
+      "sampling/sampling_logp_difference/mean": 0.021790307015180588,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.3160772823539446e-05,
+      "clip_ratio/high_mean": 3.2901932058848615e-06,
+      "clip_ratio/low_mean": 3.582628983167524e-05,
+      "clip_ratio/low_min": 2.61966624748311e-06,
+      "clip_ratio/region_mean": 3.911648195753514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 7263.1640625,
+      "completions/mean_terminated_length": 7044.26416015625,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.107876107096672,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017762042116373777,
+      "learning_rate": 1e-05,
+      "loss": 0.0349,
+      "num_tokens": 87402763.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741315841675,
+      "sampling/importance_sampling_ratio/min": 0.0009408573969267309,
+      "sampling/sampling_logp_difference/max": 6.968719005584717,
+      "sampling/sampling_logp_difference/mean": 0.02103034406900406,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.987745776612428e-05,
+      "clip_ratio/high_mean": 1.1877163728968299e-05,
+      "clip_ratio/low_mean": 4.26799579145154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.455712096136267e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15416.0,
+      "completions/mean_length": 5093.859375,
+      "completions/mean_terminated_length": 4914.65087890625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 1.1065888702869415,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032127038575708866,
+      "learning_rate": 1e-05,
+      "loss": 0.0194,
+      "num_tokens": 88077385.0,
+      "reward": 0.421875,
+      "reward_std": 0.345874547958374,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 7.033879228401929e-05,
+      "sampling/sampling_logp_difference/max": 9.562187194824219,
+      "sampling/sampling_logp_difference/mean": 0.020314980298280716,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 9.35208754526684e-06,
+      "clip_ratio/high_mean": 4.4788730519940145e-06,
+      "clip_ratio/low_mean": 3.470697703278347e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.918584917528278e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6943.53125,
+      "completions/mean_terminated_length": 6639.0,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "entropy": 0.9009081721305847,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028925195802003145,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "num_tokens": 88985269.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3535328209400177,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980628490448,
+      "sampling/importance_sampling_ratio/min": 6.553035092338177e-08,
+      "sampling/sampling_logp_difference/max": 16.540752410888672,
+      "sampling/sampling_logp_difference/mean": 0.019378282129764557,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 1.0939961612166371e-05,
+      "clip_ratio/high_mean": 2.734990403041593e-06,
+      "clip_ratio/low_mean": 2.4615862798782473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7350853201824066e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15148.0,
+      "completions/max_terminated_length": 15148.0,
+      "completions/mean_length": 4976.25,
+      "completions/mean_terminated_length": 4976.25,
+      "completions/min_length": 702.0,
+      "completions/min_terminated_length": 702.0,
+      "entropy": 0.9463540017604828,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017386430408805609,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 89645205.0,
+      "reward": 0.359375,
+      "reward_std": 0.26462042331695557,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999554753303528,
+      "sampling/importance_sampling_ratio/min": 7.889595508459024e-06,
+      "sampling/sampling_logp_difference/max": 11.74996566772461,
+      "sampling/sampling_logp_difference/mean": 0.018035830929875374,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 5.941629297012696e-06,
+      "clip_ratio/high_mean": 1.485407324253174e-06,
+      "clip_ratio/low_mean": 2.6826061798601586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8311469009167922e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 6439.5390625,
+      "completions/mean_terminated_length": 6281.69091796875,
+      "completions/min_length": 959.0,
+      "completions/min_terminated_length": 959.0,
+      "entropy": 0.899876207113266,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037381781730800867,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 90489394.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2624938488006592,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999206066131592,
+      "sampling/importance_sampling_ratio/min": 0.003606764366850257,
+      "sampling/sampling_logp_difference/max": 5.62494421005249,
+      "sampling/sampling_logp_difference/mean": 0.019368179142475128,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 5.189952389628161e-06,
+      "clip_ratio/high_mean": 1.2974880974070402e-06,
+      "clip_ratio/low_mean": 3.058137212974543e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.187886022715247e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15979.0,
+      "completions/mean_length": 6876.46875,
+      "completions/mean_terminated_length": 6408.884765625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.1018569767475128,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018562980694696307,
+      "learning_rate": 1e-05,
+      "loss": 0.095,
+      "num_tokens": 91390054.0,
+      "reward": 0.21875,
+      "reward_std": 0.29955869913101196,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999849796295166,
+      "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05,
+      "sampling/sampling_logp_difference/max": 10.436432838439941,
+      "sampling/sampling_logp_difference/mean": 0.020825792104005814,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.022083435804234e-05,
+      "clip_ratio/high_mean": 5.055208589510585e-06,
+      "clip_ratio/low_mean": 3.029032552603894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.53455343429232e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14153.0,
+      "completions/mean_length": 6501.5078125,
+      "completions/mean_terminated_length": 6344.64306640625,
+      "completions/min_length": 720.0,
+      "completions/min_terminated_length": 720.0,
+      "entropy": 1.073579266667366,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016695430967956781,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 92241535.0,
+      "reward": 0.2734375,
+      "reward_std": 0.28641316294670105,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998984336853027,
+      "sampling/importance_sampling_ratio/min": 0.0002380236255703494,
+      "sampling/sampling_logp_difference/max": 8.343140602111816,
+      "sampling/sampling_logp_difference/mean": 0.020438479259610176,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 3.3911180707946187e-06,
+      "clip_ratio/high_mean": 8.477795176986547e-07,
+      "clip_ratio/low_mean": 2.2190370486896427e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.30381500614385e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14345.0,
+      "completions/max_terminated_length": 14345.0,
+      "completions/mean_length": 5474.1328125,
+      "completions/mean_terminated_length": 5474.1328125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0692576617002487,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034909825772047043,
+      "learning_rate": 1e-05,
+      "loss": 0.0,
+      "num_tokens": 92962472.0,
+      "reward": 0.3046875,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000006079673767,
+      "sampling/importance_sampling_ratio/min": 0.0017851731972768903,
+      "sampling/sampling_logp_difference/max": 6.328239917755127,
+      "sampling/sampling_logp_difference/mean": 0.019930578768253326,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 2.6292200345778838e-05,
+      "clip_ratio/high_mean": 7.620442374900449e-06,
+      "clip_ratio/low_mean": 4.615546390596137e-05,
+      "clip_ratio/low_min": 1.366510537081922e-05,
+      "clip_ratio/region_mean": 5.3775906508235494e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7512.078125,
+      "completions/mean_terminated_length": 7225.88671875,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9676955863833427,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023449272848665714,
+      "learning_rate": 1e-05,
+      "loss": 0.0454,
+      "num_tokens": 93950506.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999359250068665,
+      "sampling/importance_sampling_ratio/min": 0.0016406332142651081,
+      "sampling/sampling_logp_difference/max": 6.412672996520996,
+      "sampling/sampling_logp_difference/mean": 0.020141655579209328,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 5.097255780128762e-06,
+      "clip_ratio/high_mean": 1.2743139450321905e-06,
+      "clip_ratio/low_mean": 3.3802551342887455e-05,
+      "clip_ratio/low_min": 4.146762421441963e-06,
+      "clip_ratio/region_mean": 3.5076865287919645e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6920.484375,
+      "completions/mean_terminated_length": 6693.3603515625,
+      "completions/min_length": 962.0,
+      "completions/min_terminated_length": 962.0,
+      "entropy": 0.8662540689110756,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037103090435266495,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 94854016.0,
+      "reward": 0.4375,
+      "reward_std": 0.322716623544693,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00047686786274425685,
+      "sampling/sampling_logp_difference/max": 7.648271083831787,
+      "sampling/sampling_logp_difference/mean": 0.01915796287357807,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 8.4922439782531e-06,
+      "clip_ratio/high_mean": 2.123060994563275e-06,
+      "clip_ratio/low_mean": 5.024227584726759e-05,
+      "clip_ratio/low_min": 1.3627016414829995e-05,
+      "clip_ratio/region_mean": 5.236533706920454e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 7939.609375,
+      "completions/mean_terminated_length": 7805.57177734375,
+      "completions/min_length": 1260.0,
+      "completions/min_terminated_length": 1260.0,
+      "entropy": 0.9707008600234985,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024642283096909523,
+      "learning_rate": 1e-05,
+      "loss": 0.0788,
+      "num_tokens": 95889966.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998771548271179,
+      "sampling/importance_sampling_ratio/min": 4.540014560916461e-05,
+      "sampling/sampling_logp_difference/max": 9.999995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020453302189707756,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.766829564710861e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.766829564710861e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14969.0,
+      "completions/mean_length": 5985.8203125,
+      "completions/mean_terminated_length": 5474.43408203125,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 0.9083090648055077,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003317479742690921,
+      "learning_rate": 1e-05,
+      "loss": 0.0537,
+      "num_tokens": 96676847.0,
+      "reward": 0.3671875,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.000286750087980181,
+      "sampling/sampling_logp_difference/max": 8.156899452209473,
+      "sampling/sampling_logp_difference/mean": 0.01996719278395176,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.8439853647578275e-05,
+      "clip_ratio/high_mean": 4.609963411894569e-06,
+      "clip_ratio/low_mean": 5.708034223061986e-05,
+      "clip_ratio/low_min": 2.75287948170444e-06,
+      "clip_ratio/region_mean": 6.169030598357494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15081.0,
+      "completions/mean_length": 6565.359375,
+      "completions/mean_terminated_length": 6488.04736328125,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 1.1013468354940414,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019073591101914644,
+      "learning_rate": 1e-05,
+      "loss": 0.0622,
+      "num_tokens": 97539453.0,
+      "reward": 0.2734375,
+      "reward_std": 0.307217001914978,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999555945396423,
+      "sampling/importance_sampling_ratio/min": 0.0006022047018632293,
+      "sampling/sampling_logp_difference/max": 7.414913177490234,
+      "sampling/sampling_logp_difference/mean": 0.02150837704539299,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 9.068485269381199e-06,
+      "clip_ratio/high_mean": 2.2671213173452998e-06,
+      "clip_ratio/low_mean": 1.9822365402433206e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.208948649240483e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16099.0,
+      "completions/mean_length": 6779.6171875,
+      "completions/mean_terminated_length": 6703.9921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8940552547574043,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0010163087863475084,
+      "learning_rate": 1e-05,
+      "loss": 0.0249,
+      "num_tokens": 98429036.0,
+      "reward": 0.453125,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485015869141,
+      "sampling/importance_sampling_ratio/min": 3.464699460664633e-08,
+      "sampling/sampling_logp_difference/max": 17.178054809570312,
+      "sampling/sampling_logp_difference/mean": 0.018716152757406235,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 5.047242211730918e-06,
+      "clip_ratio/high_mean": 1.2618105529327295e-06,
+      "clip_ratio/low_mean": 2.9014110396019532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0275920835265424e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14549.0,
+      "completions/max_terminated_length": 14549.0,
+      "completions/mean_length": 5766.71875,
+      "completions/mean_terminated_length": 5766.71875,
+      "completions/min_length": 47.0,
+      "completions/min_terminated_length": 47.0,
+      "entropy": 1.0455922111868858,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002155766822397709,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 99184264.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999253749847412,
+      "sampling/importance_sampling_ratio/min": 0.00010798005678225309,
+      "sampling/sampling_logp_difference/max": 9.133563995361328,
+      "sampling/sampling_logp_difference/mean": 0.020948775112628937,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 2.0882574972347356e-05,
+      "clip_ratio/high_mean": 6.505383225885453e-06,
+      "clip_ratio/low_mean": 4.496008500609605e-05,
+      "clip_ratio/low_min": 7.757854064038838e-06,
+      "clip_ratio/region_mean": 5.1465468231981504e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14704.0,
+      "completions/mean_length": 6167.2421875,
+      "completions/mean_terminated_length": 6005.07177734375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "entropy": 0.9100174158811569,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0021464223973453045,
+      "learning_rate": 1e-05,
+      "loss": -0.0279,
+      "num_tokens": 99996831.0,
+      "reward": 0.421875,
+      "reward_std": 0.3916535973548889,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240040779114,
+      "sampling/importance_sampling_ratio/min": 0.02249590866267681,
+      "sampling/sampling_logp_difference/max": 3.794421911239624,
+      "sampling/sampling_logp_difference/mean": 0.01866895705461502,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0998018473837874e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0998018473837874e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15738.0,
+      "completions/mean_length": 6242.9453125,
+      "completions/mean_terminated_length": 6163.09423828125,
+      "completions/min_length": 1187.0,
+      "completions/min_terminated_length": 1187.0,
+      "entropy": 0.8624134212732315,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023277695290744305,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 100814112.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999959409236908,
+      "sampling/importance_sampling_ratio/min": 0.0002393616596236825,
+      "sampling/sampling_logp_difference/max": 8.33753490447998,
+      "sampling/sampling_logp_difference/mean": 0.0191188994795084,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 6.589872555196052e-06,
+      "clip_ratio/high_mean": 1.647468138799013e-06,
+      "clip_ratio/low_mean": 4.329304238126497e-05,
+      "clip_ratio/low_min": 3.5120251595799346e-06,
+      "clip_ratio/region_mean": 4.494051017900347e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14866.0,
+      "completions/mean_length": 5733.6875,
+      "completions/mean_terminated_length": 5478.080078125,
+      "completions/min_length": 789.0,
+      "completions/min_terminated_length": 789.0,
+      "entropy": 0.9628067463636398,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003547821193933487,
+      "learning_rate": 1e-05,
+      "loss": 0.0321,
+      "num_tokens": 101566264.0,
+      "reward": 0.3984375,
+      "reward_std": 0.36584997177124023,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0001282035664189607,
+      "sampling/sampling_logp_difference/max": 8.961891174316406,
+      "sampling/sampling_logp_difference/mean": 0.019646761938929558,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.7107527582993498e-05,
+      "clip_ratio/high_mean": 4.2768818957483745e-06,
+      "clip_ratio/low_mean": 3.014796902789385e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.442485103732906e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15848.0,
+      "completions/max_terminated_length": 15848.0,
+      "completions/mean_length": 5505.9375,
+      "completions/mean_terminated_length": 5505.9375,
+      "completions/min_length": 668.0,
+      "completions/min_terminated_length": 668.0,
+      "entropy": 0.8041045889258385,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024891747161746025,
+      "learning_rate": 1e-05,
+      "loss": 0.1406,
+      "num_tokens": 102291456.0,
+      "reward": 0.5,
+      "reward_std": 0.35482609272003174,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999248385429382,
+      "sampling/importance_sampling_ratio/min": 0.0014627616619691253,
+      "sampling/sampling_logp_difference/max": 6.527429103851318,
+      "sampling/sampling_logp_difference/mean": 0.01716250739991665,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.548903105685895e-05,
+      "clip_ratio/high_mean": 3.872257764214737e-06,
+      "clip_ratio/low_mean": 5.380711581892683e-05,
+      "clip_ratio/low_min": 4.5777483137499075e-06,
+      "clip_ratio/region_mean": 5.767937363998499e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16005.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 5003.0625,
+      "completions/mean_terminated_length": 5003.0625,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 0.9115714654326439,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00220683915540576,
+      "learning_rate": 1e-05,
+      "loss": 0.1361,
+      "num_tokens": 102949824.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 8.323705696966499e-05,
+      "sampling/sampling_logp_difference/max": 9.393817901611328,
+      "sampling/sampling_logp_difference/mean": 0.018076512962579727,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.181136096623959e-05,
+      "clip_ratio/high_mean": 5.4528402415598975e-06,
+      "clip_ratio/low_mean": 3.4416837252138066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986967681157694e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15658.0,
+      "completions/max_terminated_length": 15658.0,
+      "completions/mean_length": 4742.1328125,
+      "completions/mean_terminated_length": 4742.1328125,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 0.9430246204137802,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003964806906878948,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 103580913.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 7.031940185697749e-05,
+      "sampling/sampling_logp_difference/max": 9.56246280670166,
+      "sampling/sampling_logp_difference/mean": 0.019651200622320175,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 4.07684046876966e-06,
+      "clip_ratio/high_mean": 1.019210117192415e-06,
+      "clip_ratio/low_mean": 3.8682398553646635e-05,
+      "clip_ratio/low_min": 8.189203072106466e-06,
+      "clip_ratio/region_mean": 3.970160832977854e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 6574.171875,
+      "completions/mean_terminated_length": 6091.72119140625,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.8429529070854187,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002067410387098789,
+      "learning_rate": 1e-05,
+      "loss": 0.0377,
+      "num_tokens": 104447463.0,
+      "reward": 0.3125,
+      "reward_std": 0.24511480331420898,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997583627700806,
+      "sampling/importance_sampling_ratio/min": 0.00021258489869069308,
+      "sampling/sampling_logp_difference/max": 8.456169128417969,
+      "sampling/sampling_logp_difference/mean": 0.018853647634387016,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 1.9725823221961036e-05,
+      "clip_ratio/high_mean": 4.931455805490259e-06,
+      "clip_ratio/low_mean": 5.9263072444082354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.419452870431996e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15518.0,
+      "completions/max_terminated_length": 15518.0,
+      "completions/mean_length": 4581.5625,
+      "completions/mean_terminated_length": 4581.5625,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "entropy": 0.7094272822141647,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004292502999305725,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 105052287.0,
+      "reward": 0.625,
+      "reward_std": 0.3908300995826721,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.0019342642044648528,
+      "sampling/sampling_logp_difference/max": 6.24802827835083,
+      "sampling/sampling_logp_difference/mean": 0.016310662031173706,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.0132298029930098e-05,
+      "clip_ratio/high_mean": 2.5330745074825245e-06,
+      "clip_ratio/low_mean": 4.6397121650443296e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.893019581686531e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16097.0,
+      "completions/mean_length": 7066.4453125,
+      "completions/mean_terminated_length": 6918.5478515625,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8481669947504997,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015785128343850374,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 105977048.0,
+      "reward": 0.3515625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 0.00104097044095397,
+      "sampling/sampling_logp_difference/max": 6.8676018714904785,
+      "sampling/sampling_logp_difference/mean": 0.018304405733942986,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 1.6989023606583942e-05,
+      "clip_ratio/high_mean": 4.2472559016459854e-06,
+      "clip_ratio/low_mean": 2.3075059743860038e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7322315418132348e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16104.0,
+      "completions/max_terminated_length": 16104.0,
+      "completions/mean_length": 6230.5234375,
+      "completions/mean_terminated_length": 6230.5234375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "entropy": 0.9658062160015106,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002542720176279545,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 106793187.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3050953149795532,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000169277191162,
+      "sampling/importance_sampling_ratio/min": 0.0002781494113150984,
+      "sampling/sampling_logp_difference/max": 8.187352180480957,
+      "sampling/sampling_logp_difference/mean": 0.019391046836972237,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7597974508353218e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7597974508353218e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14216.0,
+      "completions/mean_length": 5690.5546875,
+      "completions/mean_terminated_length": 5606.3544921875,
+      "completions/min_length": 1124.0,
+      "completions/min_terminated_length": 1124.0,
+      "entropy": 1.0098655670881271,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001451602904126048,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 107539874.0,
+      "reward": 0.4296875,
+      "reward_std": 0.23304283618927002,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999307990074158,
+      "sampling/importance_sampling_ratio/min": 5.640022671116185e-09,
+      "sampling/sampling_logp_difference/max": 18.993377685546875,
+      "sampling/sampling_logp_difference/mean": 0.018607191741466522,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 1.2800467629858758e-05,
+      "clip_ratio/high_mean": 4.19954119479371e-06,
+      "clip_ratio/low_mean": 2.350350996493944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.770305115973315e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15791.0,
+      "completions/max_terminated_length": 15791.0,
+      "completions/mean_length": 5471.1328125,
+      "completions/mean_terminated_length": 5471.1328125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0413162112236023,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023549250327050686,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 108260091.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999832510948181,
+      "sampling/importance_sampling_ratio/min": 0.0011709182290360332,
+      "sampling/sampling_logp_difference/max": 6.749967098236084,
+      "sampling/sampling_logp_difference/mean": 0.020427243784070015,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.1983064925734652e-05,
+      "clip_ratio/high_mean": 5.495766231433663e-06,
+      "clip_ratio/low_mean": 4.361141452591255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9107180757346214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 6211.7421875,
+      "completions/mean_terminated_length": 6050.2783203125,
+      "completions/min_length": 622.0,
+      "completions/min_terminated_length": 622.0,
+      "entropy": 0.9706784337759018,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017527056625112891,
+      "learning_rate": 1e-05,
+      "loss": 0.0686,
+      "num_tokens": 109073890.0,
+      "reward": 0.421875,
+      "reward_std": 0.29826050996780396,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999092221260071,
+      "sampling/importance_sampling_ratio/min": 0.002898645820096135,
+      "sampling/sampling_logp_difference/max": 5.843511581420898,
+      "sampling/sampling_logp_difference/mean": 0.018898162990808487,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.208964992358233e-05,
+      "clip_ratio/low_min": 3.9168990042526275e-06,
+      "clip_ratio/region_mean": 4.208964992358233e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14880.0,
+      "completions/mean_length": 6007.8984375,
+      "completions/mean_terminated_length": 5926.19677734375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.1967609524726868,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0007858420140109956,
+      "learning_rate": 1e-05,
+      "loss": 0.011,
+      "num_tokens": 109861813.0,
+      "reward": 0.296875,
+      "reward_std": 0.23486506938934326,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 3.294382011631569e-08,
+      "sampling/sampling_logp_difference/max": 17.22846221923828,
+      "sampling/sampling_logp_difference/mean": 0.021845955401659012,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 4.5118208618077915e-06,
+      "clip_ratio/high_mean": 1.1279552154519479e-06,
+      "clip_ratio/low_mean": 3.749712686840212e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8625082197540905e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15838.0,
+      "completions/mean_length": 6800.9921875,
+      "completions/mean_terminated_length": 6725.53564453125,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "entropy": 1.0437887012958527,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029428249690681696,
+      "learning_rate": 1e-05,
+      "loss": 0.0405,
+      "num_tokens": 110756572.0,
+      "reward": 0.265625,
+      "reward_std": 0.3248382806777954,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999890327453613,
+      "sampling/importance_sampling_ratio/min": 0.0006329434108920395,
+      "sampling/sampling_logp_difference/max": 7.365129470825195,
+      "sampling/sampling_logp_difference/mean": 0.02010120078921318,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.427700522071973e-05,
+      "clip_ratio/high_mean": 3.5692513051799324e-06,
+      "clip_ratio/low_mean": 4.964020990883e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.320946092979284e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6309.4453125,
+      "completions/mean_terminated_length": 6230.1181640625,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "entropy": 0.9768906533718109,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002088683657348156,
+      "learning_rate": 1e-05,
+      "loss": 0.0316,
+      "num_tokens": 111585493.0,
+      "reward": 0.375,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000007152557373,
+      "sampling/importance_sampling_ratio/min": 0.009723234921693802,
+      "sampling/sampling_logp_difference/max": 4.633236885070801,
+      "sampling/sampling_logp_difference/mean": 0.020927833393216133,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 5.4841398196003865e-06,
+      "clip_ratio/high_mean": 1.3710349549000966e-06,
+      "clip_ratio/low_mean": 5.122006064084417e-05,
+      "clip_ratio/low_min": 3.785125954891555e-06,
+      "clip_ratio/region_mean": 5.25910957094311e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15209.0,
+      "completions/mean_length": 6221.859375,
+      "completions/mean_terminated_length": 6060.5556640625,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "entropy": 0.9212924689054489,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002406956860795617,
+      "learning_rate": 1e-05,
+      "loss": 0.1051,
+      "num_tokens": 112400363.0,
+      "reward": 0.40625,
+      "reward_std": 0.31929677724838257,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 5.8308287407271564e-05,
+      "sampling/sampling_logp_difference/max": 9.74976634979248,
+      "sampling/sampling_logp_difference/mean": 0.018652018159627914,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 1.4568151755156578e-05,
+      "clip_ratio/high_mean": 3.6420379387891444e-06,
+      "clip_ratio/low_mean": 3.999794398623635e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3639981413434725e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14997.0,
+      "completions/mean_length": 6942.8203125,
+      "completions/mean_terminated_length": 6716.232421875,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "entropy": 0.949538916349411,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022962254006415606,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 113308748.0,
+      "reward": 0.375,
+      "reward_std": 0.3329663872718811,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999334812164307,
+      "sampling/importance_sampling_ratio/min": 0.00048810525913722813,
+      "sampling/sampling_logp_difference/max": 7.624979496002197,
+      "sampling/sampling_logp_difference/mean": 0.01939917355775833,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 8.786732450971613e-06,
+      "clip_ratio/high_mean": 2.196683112742903e-06,
+      "clip_ratio/low_mean": 5.562954720517155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.7826231113722315e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15182.0,
+      "completions/mean_length": 6783.1796875,
+      "completions/mean_terminated_length": 6552.76025390625,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "entropy": 0.9774708449840546,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020560629200190306,
+      "learning_rate": 1e-05,
+      "loss": 0.0473,
+      "num_tokens": 114196235.0,
+      "reward": 0.34375,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998990297317505,
+      "sampling/importance_sampling_ratio/min": 2.4757892447269114e-07,
+      "sampling/sampling_logp_difference/max": 15.211536407470703,
+      "sampling/sampling_logp_difference/mean": 0.019691556692123413,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.799483243303257e-05,
+      "clip_ratio/high_mean": 4.498708108258143e-06,
+      "clip_ratio/low_mean": 2.6389980291696702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0888688343111426e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15549.0,
+      "completions/mean_length": 5568.15625,
+      "completions/mean_terminated_length": 5396.4765625,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "entropy": 0.9303529411554337,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022214846685528755,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 114928047.0,
+      "reward": 0.234375,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999408721923828,
+      "sampling/importance_sampling_ratio/min": 2.1446083337650634e-05,
+      "sampling/sampling_logp_difference/max": 10.749968528747559,
+      "sampling/sampling_logp_difference/mean": 0.01938418298959732,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 1.1957493370573502e-05,
+      "clip_ratio/high_mean": 2.9893733426433755e-06,
+      "clip_ratio/low_mean": 5.885063319510664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.184000585562899e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15340.0,
+      "completions/max_terminated_length": 15340.0,
+      "completions/mean_length": 6086.578125,
+      "completions/mean_terminated_length": 6086.578125,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 0.9131873697042465,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002448044717311859,
+      "learning_rate": 1e-05,
+      "loss": 0.0599,
+      "num_tokens": 115725657.0,
+      "reward": 0.40625,
+      "reward_std": 0.35878273844718933,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999779462814331,
+      "sampling/importance_sampling_ratio/min": 0.02929726243019104,
+      "sampling/sampling_logp_difference/max": 3.530261278152466,
+      "sampling/sampling_logp_difference/mean": 0.019298439845442772,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 1.3385357760853367e-05,
+      "clip_ratio/high_mean": 3.3463394402133417e-06,
+      "clip_ratio/low_mean": 5.717015119444113e-05,
+      "clip_ratio/low_min": 3.4328400033700746e-06,
+      "clip_ratio/region_mean": 6.0516490520967636e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 6442.5390625,
+      "completions/mean_terminated_length": 6203.9443359375,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.8959419652819633,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002013204852119088,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 116571478.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000044584274292,
+      "sampling/importance_sampling_ratio/min": 1.0374163821325055e-06,
+      "sampling/sampling_logp_difference/max": 13.778777122497559,
+      "sampling/sampling_logp_difference/mean": 0.01925014518201351,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 9.34224021875707e-06,
+      "clip_ratio/high_mean": 3.136903728773177e-06,
+      "clip_ratio/low_mean": 2.9738095065567904e-05,
+      "clip_ratio/low_min": 3.7240065466903616e-06,
+      "clip_ratio/region_mean": 3.2874999135401595e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15946.0,
+      "completions/mean_length": 6633.5703125,
+      "completions/mean_terminated_length": 6319.0400390625,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.0223619118332863,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024523327592760324,
+      "learning_rate": 1e-05,
+      "loss": 0.056,
+      "num_tokens": 117440743.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 3.0026931199245155e-05,
+      "sampling/sampling_logp_difference/max": 10.413415908813477,
+      "sampling/sampling_logp_difference/mean": 0.02061290666460991,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 1.4537483366439119e-05,
+      "clip_ratio/high_mean": 3.6343708416097797e-06,
+      "clip_ratio/low_mean": 3.954866042477079e-05,
+      "clip_ratio/low_min": 9.874949228105834e-06,
+      "clip_ratio/region_mean": 4.318303126638057e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15919.0,
+      "completions/mean_length": 7183.0,
+      "completions/mean_terminated_length": 6886.193359375,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "entropy": 0.9815369099378586,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0018688985146582127,
+      "learning_rate": 1e-05,
+      "loss": 0.0395,
+      "num_tokens": 118380687.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2498900145292282,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999039173126221,
+      "sampling/importance_sampling_ratio/min": 1.3847662557964213e-05,
+      "sampling/sampling_logp_difference/max": 11.187394142150879,
+      "sampling/sampling_logp_difference/mean": 0.019792160019278526,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 7.165636361605721e-06,
+      "clip_ratio/high_mean": 1.7914090904014301e-06,
+      "clip_ratio/low_mean": 4.9011068711024564e-05,
+      "clip_ratio/low_min": 1.0991705721608014e-05,
+      "clip_ratio/region_mean": 5.0802477687739156e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16246.0,
+      "completions/mean_length": 6324.640625,
+      "completions/mean_terminated_length": 5829.91748046875,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "entropy": 0.852975606918335,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002005894435569644,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 119207089.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000035762786865,
+      "sampling/importance_sampling_ratio/min": 5.788659223071591e-07,
+      "sampling/sampling_logp_difference/max": 14.362195014953613,
+      "sampling/sampling_logp_difference/mean": 0.01853565312922001,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 7.795394822096569e-06,
+      "clip_ratio/high_mean": 1.948848705524142e-06,
+      "clip_ratio/low_mean": 3.834237736555224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0291225786859286e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 5723.421875,
+      "completions/mean_terminated_length": 5290.06494140625,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.8744911625981331,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002577397273853421,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 119961895.0,
+      "reward": 0.390625,
+      "reward_std": 0.34321609139442444,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999703764915466,
+      "sampling/importance_sampling_ratio/min": 0.07882421463727951,
+      "sampling/sampling_logp_difference/max": 2.5405349731445312,
+      "sampling/sampling_logp_difference/mean": 0.018341556191444397,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 9.214097190124448e-06,
+      "clip_ratio/high_mean": 2.303524297531112e-06,
+      "clip_ratio/low_mean": 2.636873176697918e-05,
+      "clip_ratio/low_min": 2.9339967113628518e-06,
+      "clip_ratio/region_mean": 2.8672255837136618e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16055.0,
+      "completions/mean_length": 7886.015625,
+      "completions/mean_terminated_length": 7682.064453125,
+      "completions/min_length": 989.0,
+      "completions/min_terminated_length": 989.0,
+      "entropy": 0.9391767829656601,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002552987542003393,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 120990289.0,
+      "reward": 0.328125,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000030994415283,
+      "sampling/importance_sampling_ratio/min": 0.000899312668479979,
+      "sampling/sampling_logp_difference/max": 7.013879776000977,
+      "sampling/sampling_logp_difference/mean": 0.02049873024225235,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 3.406416203688423e-05,
+      "clip_ratio/high_mean": 9.72330332160709e-06,
+      "clip_ratio/low_mean": 3.168332909808669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140663151019908e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 6173.1640625,
+      "completions/mean_terminated_length": 6011.087890625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.9148785546422005,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002678362652659416,
+      "learning_rate": 1e-05,
+      "loss": 0.039,
+      "num_tokens": 121797958.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3608373999595642,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999265074729919,
+      "sampling/importance_sampling_ratio/min": 0.002013920107856393,
+      "sampling/sampling_logp_difference/max": 6.207672119140625,
+      "sampling/sampling_logp_difference/mean": 0.018977735191583633,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 1.8476588593330234e-05,
+      "clip_ratio/high_mean": 4.6191471483325586e-06,
+      "clip_ratio/low_mean": 4.459614581264759e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9215293188353826e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 6594.21875,
+      "completions/mean_terminated_length": 6196.259765625,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "entropy": 0.9486038386821747,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033711253199726343,
+      "learning_rate": 1e-05,
+      "loss": 0.026,
+      "num_tokens": 122661170.0,
+      "reward": 0.3828125,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998981356620789,
+      "sampling/importance_sampling_ratio/min": 0.0002968419576063752,
+      "sampling/sampling_logp_difference/max": 8.122310638427734,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 7.97335997049231e-06,
+      "clip_ratio/high_mean": 2.7343705824023345e-06,
+      "clip_ratio/low_mean": 5.420079878604156e-05,
+      "clip_ratio/low_min": 4.594068286678521e-06,
+      "clip_ratio/region_mean": 5.693517005056492e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15928.0,
+      "completions/mean_length": 6533.9453125,
+      "completions/mean_terminated_length": 6377.595703125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9986584335565567,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017857529455795884,
+      "learning_rate": 1e-05,
+      "loss": 0.0804,
+      "num_tokens": 123518107.0,
+      "reward": 0.34375,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998549818992615,
+      "sampling/importance_sampling_ratio/min": 9.012701411847956e-06,
+      "sampling/sampling_logp_difference/max": 11.616875648498535,
+      "sampling/sampling_logp_difference/mean": 0.02010391652584076,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 4.470512521947967e-06,
+      "clip_ratio/high_mean": 1.1176281304869917e-06,
+      "clip_ratio/low_mean": 3.5141094485879876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.625872295742738e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13212.0,
+      "completions/mean_length": 5742.21875,
+      "completions/mean_terminated_length": 5658.42529296875,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0379670709371567,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018227624241262674,
+      "learning_rate": 1e-05,
+      "loss": -0.0237,
+      "num_tokens": 124279031.0,
+      "reward": 0.21875,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998506903648376,
+      "sampling/importance_sampling_ratio/min": 0.0020977305248379707,
+      "sampling/sampling_logp_difference/max": 6.16689920425415,
+      "sampling/sampling_logp_difference/mean": 0.019987668842077255,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.0003542683989508e-05,
+      "clip_ratio/high_mean": 3.21091931709816e-06,
+      "clip_ratio/low_mean": 5.731009014198207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.0521009800140746e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7584.703125,
+      "completions/mean_terminated_length": 7515.41748046875,
+      "completions/min_length": 884.0,
+      "completions/min_terminated_length": 884.0,
+      "entropy": 0.953459307551384,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002219022251665592,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 125270761.0,
+      "reward": 0.359375,
+      "reward_std": 0.37033066153526306,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999880790710449,
+      "sampling/importance_sampling_ratio/min": 0.0024849213659763336,
+      "sampling/sampling_logp_difference/max": 5.997514247894287,
+      "sampling/sampling_logp_difference/mean": 0.020291510969400406,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 7.734669452474918e-06,
+      "clip_ratio/high_mean": 1.9336673631187296e-06,
+      "clip_ratio/low_mean": 3.1135301298945706e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3068968605221016e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 4714.671875,
+      "completions/mean_terminated_length": 4622.78759765625,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 1.018719919025898,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0014189074281603098,
+      "learning_rate": 1e-05,
+      "loss": 0.0501,
+      "num_tokens": 125895279.0,
+      "reward": 0.3984375,
+      "reward_std": 0.28383445739746094,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479651451111,
+      "sampling/importance_sampling_ratio/min": 4.017410901724361e-07,
+      "sampling/sampling_logp_difference/max": 14.727458000183105,
+      "sampling/sampling_logp_difference/mean": 0.018739396706223488,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 1.0069575182569679e-05,
+      "clip_ratio/high_mean": 2.5173937956424197e-06,
+      "clip_ratio/low_mean": 3.824179225375701e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0759185367278405e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15913.0,
+      "completions/mean_length": 6316.140625,
+      "completions/mean_terminated_length": 6074.51220703125,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "entropy": 0.9325072392821312,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001702460227534175,
+      "learning_rate": 1e-05,
+      "loss": 0.1007,
+      "num_tokens": 126722881.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999539852142334,
+      "sampling/importance_sampling_ratio/min": 0.0012551364488899708,
+      "sampling/sampling_logp_difference/max": 6.680510997772217,
+      "sampling/sampling_logp_difference/mean": 0.01929408684372902,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 6.873041002108948e-06,
+      "clip_ratio/high_mean": 1.718260250527237e-06,
+      "clip_ratio/low_mean": 3.119859468370123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.291685527528898e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15832.0,
+      "completions/mean_length": 4687.140625,
+      "completions/mean_terminated_length": 4595.03955078125,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 1.0886607319116592,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032931750174611807,
+      "learning_rate": 1e-05,
+      "loss": 0.0078,
+      "num_tokens": 127341715.0,
+      "reward": 0.28125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821186065674,
+      "sampling/importance_sampling_ratio/min": 0.0019364450126886368,
+      "sampling/sampling_logp_difference/max": 6.246901512145996,
+      "sampling/sampling_logp_difference/mean": 0.020621225237846375,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 1.773085250533768e-05,
+      "clip_ratio/high_mean": 4.43271312633442e-06,
+      "clip_ratio/low_mean": 4.30743207289197e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7507033741567284e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14125.0,
+      "completions/mean_length": 5705.515625,
+      "completions/mean_terminated_length": 5449.232421875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0523068830370903,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0031696646474301815,
+      "learning_rate": 1e-05,
+      "loss": -0.0414,
+      "num_tokens": 128093597.0,
+      "reward": 0.1953125,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619126319885,
+      "sampling/importance_sampling_ratio/min": 3.197810656274669e-05,
+      "sampling/sampling_logp_difference/max": 10.350459098815918,
+      "sampling/sampling_logp_difference/mean": 0.021961934864521027,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 1.885905066956184e-05,
+      "clip_ratio/high_mean": 4.71476266739046e-06,
+      "clip_ratio/low_mean": 5.0530389898995054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.524515336219338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15958.0,
+      "completions/mean_length": 6214.4921875,
+      "completions/mean_terminated_length": 6053.07177734375,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.9371421113610268,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023704832419753075,
+      "learning_rate": 1e-05,
+      "loss": 0.075,
+      "num_tokens": 128906948.0,
+      "reward": 0.40625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.0003354824730195105,
+      "sampling/sampling_logp_difference/max": 7.999940872192383,
+      "sampling/sampling_logp_difference/mean": 0.01882763020694256,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 3.042072216885572e-05,
+      "clip_ratio/high_mean": 7.60518054221393e-06,
+      "clip_ratio/low_mean": 4.5897569179942366e-05,
+      "clip_ratio/low_min": 8.727477506909054e-06,
+      "clip_ratio/region_mean": 5.3502750233747065e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 7127.0703125,
+      "completions/mean_terminated_length": 7054.18115234375,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.9854387491941452,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003370177699252963,
+      "learning_rate": 1e-05,
+      "loss": 0.1197,
+      "num_tokens": 129839813.0,
+      "reward": 0.359375,
+      "reward_std": 0.3329663574695587,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999907910823822,
+      "sampling/importance_sampling_ratio/min": 1.077816432371037e-05,
+      "sampling/sampling_logp_difference/max": 11.43798828125,
+      "sampling/sampling_logp_difference/mean": 0.019736800342798233,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.1401074718596647e-05,
+      "clip_ratio/high_mean": 6.243764005375851e-06,
+      "clip_ratio/low_mean": 3.2797592325550795e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.904135610355297e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 6566.2890625,
+      "completions/mean_terminated_length": 6330.6640625,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.7978609576821327,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026055986527353525,
+      "learning_rate": 1e-05,
+      "loss": 0.0661,
+      "num_tokens": 130698370.0,
+      "reward": 0.5,
+      "reward_std": 0.36295419931411743,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999133944511414,
+      "sampling/importance_sampling_ratio/min": 0.00031152591691352427,
+      "sampling/sampling_logp_difference/max": 8.074028015136719,
+      "sampling/sampling_logp_difference/mean": 0.01787097379565239,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0564424403346493e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0564424403346493e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15576.0,
+      "completions/max_terminated_length": 15576.0,
+      "completions/mean_length": 7186.2890625,
+      "completions/mean_terminated_length": 7186.2890625,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 1.0232757329940796,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0023866184055805206,
+      "learning_rate": 1e-05,
+      "loss": 0.0683,
+      "num_tokens": 131637439.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2059282809495926,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999207258224487,
+      "sampling/importance_sampling_ratio/min": 0.0007378471200354397,
+      "sampling/sampling_logp_difference/max": 7.211773872375488,
+      "sampling/sampling_logp_difference/mean": 0.02137116715312004,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 4.037900725961663e-05,
+      "clip_ratio/high_mean": 1.0094751814904157e-05,
+      "clip_ratio/low_mean": 5.8380828136250784e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.847557995115494e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13638.0,
+      "completions/mean_length": 5591.5703125,
+      "completions/mean_terminated_length": 5420.26220703125,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9335208311676979,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003491115989163518,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 132371816.0,
+      "reward": 0.5,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999891459941864,
+      "sampling/importance_sampling_ratio/min": 0.00012356207298580557,
+      "sampling/sampling_logp_difference/max": 8.998766899108887,
+      "sampling/sampling_logp_difference/mean": 0.018760837614536285,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 2.8378776733006816e-06,
+      "clip_ratio/high_mean": 7.094694183251704e-07,
+      "clip_ratio/low_mean": 4.4085751369493664e-05,
+      "clip_ratio/low_min": 6.7955093072669115e-06,
+      "clip_ratio/region_mean": 4.4795220674132e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16302.0,
+      "completions/mean_length": 7152.3828125,
+      "completions/mean_terminated_length": 6930.82421875,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.1329835206270218,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002830669516697526,
+      "learning_rate": 1e-05,
+      "loss": 0.0526,
+      "num_tokens": 133307297.0,
+      "reward": 0.28125,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999501705169678,
+      "sampling/importance_sampling_ratio/min": 0.00028047082014381886,
+      "sampling/sampling_logp_difference/max": 8.179040908813477,
+      "sampling/sampling_logp_difference/mean": 0.021548541262745857,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.0150829439226072e-05,
+      "clip_ratio/high_mean": 2.537707359806518e-06,
+      "clip_ratio/low_mean": 3.4009618616437365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.654732597624388e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15068.0,
+      "completions/mean_length": 7263.453125,
+      "completions/mean_terminated_length": 7118.68310546875,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.092760555446148,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0027821618132293224,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 134260107.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999946117401123,
+      "sampling/importance_sampling_ratio/min": 7.832317351130769e-05,
+      "sampling/sampling_logp_difference/max": 9.454667091369629,
+      "sampling/sampling_logp_difference/mean": 0.022098438814282417,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 1.0561876024439698e-05,
+      "clip_ratio/high_mean": 2.6404690061099245e-06,
+      "clip_ratio/low_mean": 1.6864279416495265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9504748649978865e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15388.0,
+      "completions/mean_length": 7088.8125,
+      "completions/mean_terminated_length": 6710.958984375,
+      "completions/min_length": 1314.0,
+      "completions/min_terminated_length": 1314.0,
+      "entropy": 1.0669445469975471,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0007076738984324038,
+      "learning_rate": 1e-05,
+      "loss": -0.0197,
+      "num_tokens": 135186139.0,
+      "reward": 0.328125,
+      "reward_std": 0.20593319833278656,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998199343681335,
+      "sampling/importance_sampling_ratio/min": 3.084653872065246e-05,
+      "sampling/sampling_logp_difference/max": 10.386486053466797,
+      "sampling/sampling_logp_difference/mean": 0.020075790584087372,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 7.095016371749807e-06,
+      "clip_ratio/high_mean": 1.7737540929374518e-06,
+      "clip_ratio/low_mean": 2.7592465016823553e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.936621888238733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15626.0,
+      "completions/max_terminated_length": 15626.0,
+      "completions/mean_length": 5352.734375,
+      "completions/mean_terminated_length": 5352.734375,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "entropy": 1.0387161895632744,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0022445612121373415,
+      "learning_rate": 1e-05,
+      "loss": 0.0261,
+      "num_tokens": 135888929.0,
+      "reward": 0.4765625,
+      "reward_std": 0.399257630109787,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054670333862,
+      "sampling/importance_sampling_ratio/min": 0.00032565294532105327,
+      "sampling/sampling_logp_difference/max": 8.029678344726562,
+      "sampling/sampling_logp_difference/mean": 0.02010166086256504,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 1.5100852124305675e-05,
+      "clip_ratio/high_mean": 4.426987970873597e-06,
+      "clip_ratio/low_mean": 2.7625993425317574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2052981168817496e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16266.0,
+      "completions/mean_length": 7758.90625,
+      "completions/mean_terminated_length": 7408.29248046875,
+      "completions/min_length": 742.0,
+      "completions/min_terminated_length": 742.0,
+      "entropy": 1.0648984238505363,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022021254990249872,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 136901941.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858140945435,
+      "sampling/importance_sampling_ratio/min": 2.2461865967216e-07,
+      "sampling/sampling_logp_difference/max": 15.30886173248291,
+      "sampling/sampling_logp_difference/mean": 0.021426808089017868,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 2.5346608254039893e-05,
+      "clip_ratio/high_mean": 7.4063813144675805e-06,
+      "clip_ratio/low_mean": 2.2069365058996482e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9475746259777225e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 7036.953125,
+      "completions/mean_terminated_length": 6496.21484375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9684997871518135,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0013461806811392307,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 137824623.0,
+      "reward": 0.34375,
+      "reward_std": 0.2546031177043915,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999944806098938,
+      "sampling/importance_sampling_ratio/min": 5.834372132085264e-05,
+      "sampling/sampling_logp_difference/max": 9.74915885925293,
+      "sampling/sampling_logp_difference/mean": 0.020304443314671516,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.3147734080121154e-05,
+      "clip_ratio/high_mean": 3.2869335200302885e-06,
+      "clip_ratio/low_mean": 4.841489999307669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.170183294467279e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15500.0,
+      "completions/mean_length": 6114.1875,
+      "completions/mean_terminated_length": 5951.1748046875,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "entropy": 0.943072073161602,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132438588887453,
+      "learning_rate": 1e-05,
+      "loss": 0.0943,
+      "num_tokens": 138625247.0,
+      "reward": 0.40625,
+      "reward_std": 0.321650892496109,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999298453330994,
+      "sampling/importance_sampling_ratio/min": 0.0017275095451623201,
+      "sampling/sampling_logp_difference/max": 6.361074447631836,
+      "sampling/sampling_logp_difference/mean": 0.020084267482161522,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 1.7873157958092634e-05,
+      "clip_ratio/high_mean": 4.468289489523158e-06,
+      "clip_ratio/low_mean": 3.5252990301160025e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9721279790683184e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15050.0,
+      "completions/mean_length": 7618.875,
+      "completions/mean_terminated_length": 7034.53369140625,
+      "completions/min_length": 1030.0,
+      "completions/min_terminated_length": 1030.0,
+      "entropy": 0.9142575263977051,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026741649489849806,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 139619287.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 0.005949751473963261,
+      "sampling/sampling_logp_difference/max": 5.124405860900879,
+      "sampling/sampling_logp_difference/mean": 0.020061582326889038,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.0512151675357018e-05,
+      "clip_ratio/high_mean": 2.6280379188392544e-06,
+      "clip_ratio/low_mean": 4.5301517502593924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.792955542143318e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16106.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 5333.875,
+      "completions/mean_terminated_length": 5333.875,
+      "completions/min_length": 1109.0,
+      "completions/min_terminated_length": 1109.0,
+      "entropy": 0.8107482865452766,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027016003150492907,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 140318935.0,
+      "reward": 0.5703125,
+      "reward_std": 0.2556639611721039,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.006856904830783606,
+      "sampling/sampling_logp_difference/max": 4.982499122619629,
+      "sampling/sampling_logp_difference/mean": 0.017069874331355095,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.85085939392593e-05,
+      "clip_ratio/high_mean": 5.24943533264377e-06,
+      "clip_ratio/low_mean": 5.6120721524166584e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.137015702734061e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16050.0,
+      "completions/mean_length": 7443.3046875,
+      "completions/mean_terminated_length": 7154.89501953125,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 0.9224414080381393,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002655779244378209,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 141293534.0,
+      "reward": 0.234375,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999659061431885,
+      "sampling/importance_sampling_ratio/min": 0.00042018835665658116,
+      "sampling/sampling_logp_difference/max": 7.774807453155518,
+      "sampling/sampling_logp_difference/mean": 0.02006504125893116,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.494229445597739e-05,
+      "clip_ratio/high_mean": 3.7355736139943474e-06,
+      "clip_ratio/low_mean": 2.2748562741981004e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6484136355975352e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15923.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 5646.6875,
+      "completions/mean_terminated_length": 5646.6875,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8945339694619179,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016281780553981662,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 142037438.0,
+      "reward": 0.46875,
+      "reward_std": 0.17912296950817108,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030517578125,
+      "sampling/importance_sampling_ratio/min": 0.0005717006279155612,
+      "sampling/sampling_logp_difference/max": 7.46689510345459,
+      "sampling/sampling_logp_difference/mean": 0.019336247816681862,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 3.335990868436056e-05,
+      "clip_ratio/high_mean": 8.33997717109014e-06,
+      "clip_ratio/low_mean": 3.5050728683927446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339070608239126e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14142.0,
+      "completions/mean_length": 6384.640625,
+      "completions/mean_terminated_length": 5892.86865234375,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.840093269944191,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002166559686884284,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 142873848.0,
+      "reward": 0.4765625,
+      "reward_std": 0.35506346821784973,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000462532043457,
+      "sampling/importance_sampling_ratio/min": 4.785555574926548e-06,
+      "sampling/sampling_logp_difference/max": 12.249908447265625,
+      "sampling/sampling_logp_difference/mean": 0.018109092488884926,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.541105484648142e-05,
+      "clip_ratio/high_mean": 3.852763711620355e-06,
+      "clip_ratio/low_mean": 4.0552770769863855e-05,
+      "clip_ratio/low_min": 7.133888630050933e-06,
+      "clip_ratio/region_mean": 4.440553459517105e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14828.0,
+      "completions/mean_length": 5775.0,
+      "completions/mean_terminated_length": 5691.46435546875,
+      "completions/min_length": 1147.0,
+      "completions/min_terminated_length": 1147.0,
+      "entropy": 0.8915362879633904,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021932912059128284,
+      "learning_rate": 1e-05,
+      "loss": -0.0086,
+      "num_tokens": 143636152.0,
+      "reward": 0.4375,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000008225440979,
+      "sampling/importance_sampling_ratio/min": 9.714113069492214e-09,
+      "sampling/sampling_logp_difference/max": 18.44968605041504,
+      "sampling/sampling_logp_difference/mean": 0.019278086721897125,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7509142171311396e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7509142171311396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6181.640625,
+      "completions/mean_terminated_length": 6019.69873046875,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "entropy": 1.0544511675834656,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022947140969336033,
+      "learning_rate": 1e-05,
+      "loss": 0.0242,
+      "num_tokens": 144447370.0,
+      "reward": 0.234375,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999147653579712,
+      "sampling/importance_sampling_ratio/min": 7.419757253046555e-08,
+      "sampling/sampling_logp_difference/max": 16.416534423828125,
+      "sampling/sampling_logp_difference/mean": 0.02050788700580597,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.5700999938417226e-05,
+      "clip_ratio/high_mean": 3.9252499846043065e-06,
+      "clip_ratio/low_mean": 2.4595847037289786e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8521096965050674e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 6542.3046875,
+      "completions/mean_terminated_length": 6306.1044921875,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.933225467801094,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034910975955426693,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 145303505.0,
+      "reward": 0.390625,
+      "reward_std": 0.30433881282806396,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999945163726807,
+      "sampling/importance_sampling_ratio/min": 0.007213745731860399,
+      "sampling/sampling_logp_difference/max": 4.931766986846924,
+      "sampling/sampling_logp_difference/mean": 0.020022759214043617,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.0999414017715026e-06,
+      "clip_ratio/high_mean": 1.5249853504428756e-06,
+      "clip_ratio/low_mean": 2.61421698724007e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7667155109156738e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 5889.4765625,
+      "completions/mean_terminated_length": 5637.6083984375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 0.9649673849344254,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024078311398625374,
+      "learning_rate": 1e-05,
+      "loss": 0.0391,
+      "num_tokens": 146082198.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999341368675232,
+      "sampling/importance_sampling_ratio/min": 0.0008680344326421618,
+      "sampling/sampling_logp_difference/max": 7.04927921295166,
+      "sampling/sampling_logp_difference/mean": 0.02060198038816452,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 7.789618393871933e-06,
+      "clip_ratio/high_mean": 1.9474045984679833e-06,
+      "clip_ratio/low_mean": 3.6395756637830345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.834316100892465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16233.0,
+      "completions/mean_length": 5349.2421875,
+      "completions/mean_terminated_length": 5084.408203125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8402756005525589,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021191861014813185,
+      "learning_rate": 1e-05,
+      "loss": 0.1275,
+      "num_tokens": 146786245.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999837875366211,
+      "sampling/importance_sampling_ratio/min": 3.763807762879878e-05,
+      "sampling/sampling_logp_difference/max": 10.187494277954102,
+      "sampling/sampling_logp_difference/mean": 0.017112664878368378,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 1.2461773394534248e-05,
+      "clip_ratio/high_mean": 3.115443348633562e-06,
+      "clip_ratio/low_mean": 5.095924211673264e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4074685294835945e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 7272.3203125,
+      "completions/mean_terminated_length": 7053.64013671875,
+      "completions/min_length": 1074.0,
+      "completions/min_terminated_length": 1074.0,
+      "entropy": 0.9627499282360077,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022120666690170765,
+      "learning_rate": 1e-05,
+      "loss": 0.0079,
+      "num_tokens": 147737086.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27304792404174805,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999538660049438,
+      "sampling/importance_sampling_ratio/min": 1.6960719221970066e-05,
+      "sampling/sampling_logp_difference/max": 10.984610557556152,
+      "sampling/sampling_logp_difference/mean": 0.0203307643532753,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 1.7891727566166082e-05,
+      "clip_ratio/high_mean": 4.472931891541521e-06,
+      "clip_ratio/low_mean": 5.616715043288423e-05,
+      "clip_ratio/low_min": 7.80031223257538e-06,
+      "clip_ratio/region_mean": 6.064008221073891e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16212.0,
+      "completions/mean_length": 6387.1875,
+      "completions/mean_terminated_length": 5895.54052734375,
+      "completions/min_length": 1310.0,
+      "completions/min_terminated_length": 1310.0,
+      "entropy": 0.9110158830881119,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030851473566144705,
+      "learning_rate": 1e-05,
+      "loss": 0.1091,
+      "num_tokens": 148573782.0,
+      "reward": 0.40625,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997878074646,
+      "sampling/importance_sampling_ratio/min": 0.003961040172725916,
+      "sampling/sampling_logp_difference/max": 5.531248569488525,
+      "sampling/sampling_logp_difference/mean": 0.018049638718366623,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 1.6994396901282016e-05,
+      "clip_ratio/high_mean": 5.400205964178895e-06,
+      "clip_ratio/low_mean": 3.274822392995702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8148429439388565e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 7267.59375,
+      "completions/mean_terminated_length": 7195.81103515625,
+      "completions/min_length": 653.0,
+      "completions/min_terminated_length": 653.0,
+      "entropy": 0.9254888147115707,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020694085396826267,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 149521258.0,
+      "reward": 0.2734375,
+      "reward_std": 0.29719972610473633,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054670333862,
+      "sampling/importance_sampling_ratio/min": 7.411616934405174e-06,
+      "sampling/sampling_logp_difference/max": 11.812461853027344,
+      "sampling/sampling_logp_difference/mean": 0.01898832805454731,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 4.10414668294834e-06,
+      "clip_ratio/high_mean": 1.026036670737085e-06,
+      "clip_ratio/low_mean": 4.7441100377909606e-05,
+      "clip_ratio/low_min": 4.552241534838686e-06,
+      "clip_ratio/region_mean": 4.8467136821273016e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16076.0,
+      "completions/mean_length": 7100.1953125,
+      "completions/mean_terminated_length": 6952.83349609375,
+      "completions/min_length": 560.0,
+      "completions/min_terminated_length": 560.0,
+      "entropy": 0.8455610796809196,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003085972974076867,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 150447923.0,
+      "reward": 0.25,
+      "reward_std": 0.23645778000354767,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999178647994995,
+      "sampling/importance_sampling_ratio/min": 0.0011708807433024049,
+      "sampling/sampling_logp_difference/max": 6.749999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01974140852689743,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.6514521121280268e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6514521121280268e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15535.0,
+      "completions/mean_length": 6626.4296875,
+      "completions/mean_terminated_length": 6549.5986328125,
+      "completions/min_length": 1746.0,
+      "completions/min_terminated_length": 1746.0,
+      "entropy": 1.0323699787259102,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003505800850689411,
+      "learning_rate": 1e-05,
+      "loss": 0.0885,
+      "num_tokens": 151313834.0,
+      "reward": 0.390625,
+      "reward_std": 0.17176413536071777,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381303787231,
+      "sampling/importance_sampling_ratio/min": 2.8102756914449856e-05,
+      "sampling/sampling_logp_difference/max": 10.479642868041992,
+      "sampling/sampling_logp_difference/mean": 0.021082937717437744,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 2.006086378969485e-05,
+      "clip_ratio/high_mean": 5.890002398700744e-06,
+      "clip_ratio/low_mean": 3.503898199141986e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.092898473118112e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15595.0,
+      "completions/mean_length": 7093.109375,
+      "completions/mean_terminated_length": 6870.12841796875,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "entropy": 1.0206764563918114,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002495395252481103,
+      "learning_rate": 1e-05,
+      "loss": 0.0308,
+      "num_tokens": 152238192.0,
+      "reward": 0.2890625,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999728798866272,
+      "sampling/importance_sampling_ratio/min": 9.536534344078973e-05,
+      "sampling/sampling_logp_difference/max": 9.257795333862305,
+      "sampling/sampling_logp_difference/mean": 0.020610272884368896,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 3.2352409107261337e-06,
+      "clip_ratio/high_mean": 8.088102276815334e-07,
+      "clip_ratio/low_mean": 4.056704699451075e-05,
+      "clip_ratio/low_min": 1.1648833606159315e-05,
+      "clip_ratio/region_mean": 4.1375856994818605e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14191.0,
+      "completions/mean_length": 6795.71875,
+      "completions/mean_terminated_length": 6486.4189453125,
+      "completions/min_length": 424.0,
+      "completions/min_terminated_length": 424.0,
+      "entropy": 0.8927837759256363,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014066790463402867,
+      "learning_rate": 1e-05,
+      "loss": -0.0031,
+      "num_tokens": 153131828.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 5.093755135021638e-06,
+      "sampling/sampling_logp_difference/max": 12.187495231628418,
+      "sampling/sampling_logp_difference/mean": 0.01874586008489132,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 1.5244630048982799e-05,
+      "clip_ratio/high_mean": 3.8111575122456998e-06,
+      "clip_ratio/low_mean": 3.655197178886738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.03631290737394e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15831.0,
+      "completions/mean_length": 7075.1015625,
+      "completions/mean_terminated_length": 6617.28662109375,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 0.8989318311214447,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017937121447175741,
+      "learning_rate": 1e-05,
+      "loss": 0.0359,
+      "num_tokens": 154057097.0,
+      "reward": 0.3984375,
+      "reward_std": 0.23068872094154358,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998950958251953,
+      "sampling/importance_sampling_ratio/min": 0.00021659507183358073,
+      "sampling/sampling_logp_difference/max": 8.437480926513672,
+      "sampling/sampling_logp_difference/mean": 0.01890135183930397,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.4074375030759256e-05,
+      "clip_ratio/high_mean": 4.977033995601232e-06,
+      "clip_ratio/low_mean": 3.2670792506905855e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.764782627513341e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14100.0,
+      "completions/mean_length": 7120.0,
+      "completions/mean_terminated_length": 6743.41455078125,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "entropy": 0.8758384585380554,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003410576842725277,
+      "learning_rate": 1e-05,
+      "loss": 0.0536,
+      "num_tokens": 154988585.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999953508377075,
+      "sampling/importance_sampling_ratio/min": 0.003589102067053318,
+      "sampling/sampling_logp_difference/max": 5.629853248596191,
+      "sampling/sampling_logp_difference/mean": 0.018400676548480988,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.977112736994968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.977112736994968e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 6590.6796875,
+      "completions/mean_terminated_length": 6513.56689453125,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9243742749094963,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003304310142993927,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 155851000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999579787254333,
+      "sampling/importance_sampling_ratio/min": 1.2693599273916334e-06,
+      "sampling/sampling_logp_difference/max": 13.576997756958008,
+      "sampling/sampling_logp_difference/mean": 0.01959652081131935,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 1.1435367014200892e-05,
+      "clip_ratio/high_mean": 2.858841753550223e-06,
+      "clip_ratio/low_mean": 4.7742656533955596e-05,
+      "clip_ratio/low_min": 8.646529749967158e-06,
+      "clip_ratio/region_mean": 5.0601498060132144e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 6999.484375,
+      "completions/mean_terminated_length": 6696.7578125,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.843244343996048,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023830258287489414,
+      "learning_rate": 1e-05,
+      "loss": 0.1142,
+      "num_tokens": 156766782.0,
+      "reward": 0.359375,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998635053634644,
+      "sampling/importance_sampling_ratio/min": 0.00014761318743694574,
+      "sampling/sampling_logp_difference/max": 8.820915222167969,
+      "sampling/sampling_logp_difference/mean": 0.018434934318065643,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 2.5114631171163637e-05,
+      "clip_ratio/high_mean": 7.040741365926806e-06,
+      "clip_ratio/low_mean": 5.3607667723554187e-05,
+      "clip_ratio/low_min": 9.219345429301029e-06,
+      "clip_ratio/region_mean": 6.064840863473364e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14986.0,
+      "completions/mean_length": 6407.5,
+      "completions/mean_terminated_length": 6249.14306640625,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 0.9549195989966393,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024427250027656555,
+      "learning_rate": 1e-05,
+      "loss": 0.0795,
+      "num_tokens": 157606126.0,
+      "reward": 0.3515625,
+      "reward_std": 0.32879000902175903,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966025352478,
+      "sampling/importance_sampling_ratio/min": 0.0002305622911080718,
+      "sampling/sampling_logp_difference/max": 8.37498950958252,
+      "sampling/sampling_logp_difference/mean": 0.0192743968218565,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.928529067958152e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.928529067958152e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15519.0,
+      "completions/mean_length": 6638.390625,
+      "completions/mean_terminated_length": 5901.328125,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "entropy": 0.9070822075009346,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002024515997618437,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "num_tokens": 158474248.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999830722808838,
+      "sampling/importance_sampling_ratio/min": 0.0036068728659301996,
+      "sampling/sampling_logp_difference/max": 5.624914169311523,
+      "sampling/sampling_logp_difference/mean": 0.01955476775765419,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 8.365173471247545e-06,
+      "clip_ratio/high_mean": 2.091293367811886e-06,
+      "clip_ratio/low_mean": 4.1470637825113954e-05,
+      "clip_ratio/low_min": 4.027710474474588e-06,
+      "clip_ratio/region_mean": 4.356193130661268e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15708.0,
+      "completions/mean_length": 7324.546875,
+      "completions/mean_terminated_length": 6878.99951171875,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9108889549970627,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022787705529481173,
+      "learning_rate": 1e-05,
+      "loss": 0.0616,
+      "num_tokens": 159434350.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26515230536460876,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999351501464844,
+      "sampling/importance_sampling_ratio/min": 0.03948089852929115,
+      "sampling/sampling_logp_difference/max": 3.231938362121582,
+      "sampling/sampling_logp_difference/mean": 0.019122496247291565,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 8.65733409227687e-06,
+      "clip_ratio/high_mean": 2.1643335230692173e-06,
+      "clip_ratio/low_mean": 3.456336048657249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.672769389595487e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13983.0,
+      "completions/mean_length": 5520.4453125,
+      "completions/mean_terminated_length": 5434.9052734375,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 0.8982062339782715,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026195270475000143,
+      "learning_rate": 1e-05,
+      "loss": 0.049,
+      "num_tokens": 160163055.0,
+      "reward": 0.4375,
+      "reward_std": 0.24831004440784454,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998810291290283,
+      "sampling/importance_sampling_ratio/min": 0.0005541297141462564,
+      "sampling/sampling_logp_difference/max": 7.498111724853516,
+      "sampling/sampling_logp_difference/mean": 0.019064132124185562,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 1.8376186289970065e-05,
+      "clip_ratio/high_mean": 6.650576210631698e-06,
+      "clip_ratio/low_mean": 4.059042771586974e-05,
+      "clip_ratio/low_min": 5.350111223378917e-06,
+      "clip_ratio/region_mean": 4.724100449493562e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15267.0,
+      "completions/max_terminated_length": 15267.0,
+      "completions/mean_length": 6846.515625,
+      "completions/mean_terminated_length": 6846.515625,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9657742157578468,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0014831912703812122,
+      "learning_rate": 1e-05,
+      "loss": 0.006,
+      "num_tokens": 161057657.0,
+      "reward": 0.296875,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999252557754517,
+      "sampling/importance_sampling_ratio/min": 6.252834282349795e-05,
+      "sampling/sampling_logp_difference/max": 9.679890632629395,
+      "sampling/sampling_logp_difference/mean": 0.020372584462165833,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 1.658901419432368e-05,
+      "clip_ratio/high_mean": 4.14725354858092e-06,
+      "clip_ratio/low_mean": 4.473214539757464e-05,
+      "clip_ratio/low_min": 2.9674999950657366e-06,
+      "clip_ratio/region_mean": 4.887939894615556e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16370.0,
+      "completions/mean_length": 6946.8984375,
+      "completions/mean_terminated_length": 6642.4755859375,
+      "completions/min_length": 1133.0,
+      "completions/min_terminated_length": 1133.0,
+      "entropy": 0.8490508273243904,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017962189158424735,
+      "learning_rate": 1e-05,
+      "loss": 0.0696,
+      "num_tokens": 161966356.0,
+      "reward": 0.4296875,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 7.035569433355704e-05,
+      "sampling/sampling_logp_difference/max": 9.561946868896484,
+      "sampling/sampling_logp_difference/mean": 0.019146796315908432,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.22491199540309e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.22491199540309e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15123.0,
+      "completions/mean_length": 6618.9765625,
+      "completions/mean_terminated_length": 6463.9765625,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.9541772454977036,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017619321588426828,
+      "learning_rate": 1e-05,
+      "loss": 0.0509,
+      "num_tokens": 162836705.0,
+      "reward": 0.390625,
+      "reward_std": 0.2130674123764038,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999436140060425,
+      "sampling/importance_sampling_ratio/min": 4.2106199771296815e-07,
+      "sampling/sampling_logp_difference/max": 14.680485725402832,
+      "sampling/sampling_logp_difference/mean": 0.020236656069755554,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 1.6846054222696694e-05,
+      "clip_ratio/high_mean": 4.211513555674173e-06,
+      "clip_ratio/low_mean": 3.877300162002939e-05,
+      "clip_ratio/low_min": 4.230834292684449e-06,
+      "clip_ratio/region_mean": 4.298451551676408e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12469.0,
+      "completions/mean_length": 5485.71875,
+      "completions/mean_terminated_length": 5312.73046875,
+      "completions/min_length": 104.0,
+      "completions/min_terminated_length": 104.0,
+      "entropy": 0.8888534903526306,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002670915797352791,
+      "learning_rate": 1e-05,
+      "loss": 0.0709,
+      "num_tokens": 163558197.0,
+      "reward": 0.46875,
+      "reward_std": 0.3145885467529297,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000442266464233,
+      "sampling/importance_sampling_ratio/min": 0.0005042250850237906,
+      "sampling/sampling_logp_difference/max": 7.592487812042236,
+      "sampling/sampling_logp_difference/mean": 0.019581373780965805,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6889288480779214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6889288480779214e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16184.0,
+      "completions/mean_length": 4345.171875,
+      "completions/mean_terminated_length": 4250.3779296875,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.8308270424604416,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004005427472293377,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 164133499.0,
+      "reward": 0.578125,
+      "reward_std": 0.31642353534698486,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999247193336487,
+      "sampling/importance_sampling_ratio/min": 0.022981969639658928,
+      "sampling/sampling_logp_difference/max": 3.773045301437378,
+      "sampling/sampling_logp_difference/mean": 0.017508968710899353,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.2997116300539346e-05,
+      "clip_ratio/high_mean": 3.2492790751348366e-06,
+      "clip_ratio/low_mean": 2.723402121773688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0483300406558556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5227.296875,
+      "completions/mean_terminated_length": 5050.20654296875,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "entropy": 0.9231975972652435,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0031033784616738558,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 164823681.0,
+      "reward": 0.4765625,
+      "reward_std": 0.29249146580696106,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999896764755249,
+      "sampling/importance_sampling_ratio/min": 0.0021342060063034296,
+      "sampling/sampling_logp_difference/max": 6.149660587310791,
+      "sampling/sampling_logp_difference/mean": 0.019171088933944702,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 2.0835890609305352e-05,
+      "clip_ratio/high_mean": 5.208972652326338e-06,
+      "clip_ratio/low_mean": 2.9314877565411734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.452385044511175e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14160.0,
+      "completions/mean_length": 6473.4765625,
+      "completions/mean_terminated_length": 6316.1669921875,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 0.9061874598264694,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003495733719319105,
+      "learning_rate": 1e-05,
+      "loss": 0.0785,
+      "num_tokens": 165668798.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000354051589966,
+      "sampling/importance_sampling_ratio/min": 0.0004697878030128777,
+      "sampling/sampling_logp_difference/max": 7.663229465484619,
+      "sampling/sampling_logp_difference/mean": 0.018978482112288475,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.991967162164656e-05,
+      "clip_ratio/low_min": 6.304534053924726e-06,
+      "clip_ratio/region_mean": 3.991967162164656e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14659.0,
+      "completions/mean_length": 7140.1953125,
+      "completions/mean_terminated_length": 6605.4296875,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "entropy": 0.9605444446206093,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002381941769272089,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 166603375.0,
+      "reward": 0.3046875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 0.00043123820796608925,
+      "sampling/sampling_logp_difference/max": 7.748849868774414,
+      "sampling/sampling_logp_difference/mean": 0.021141134202480316,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.4948576790629886e-05,
+      "clip_ratio/high_mean": 3.7371441976574715e-06,
+      "clip_ratio/low_mean": 3.4953729482367635e-05,
+      "clip_ratio/low_min": 3.991060111729894e-06,
+      "clip_ratio/region_mean": 3.869087413477246e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13770.0,
+      "completions/mean_length": 5304.46875,
+      "completions/mean_terminated_length": 5038.56005859375,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.9176690131425858,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040566748939454556,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 167302275.0,
+      "reward": 0.4296875,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999827742576599,
+      "sampling/importance_sampling_ratio/min": 5.001809313398553e-07,
+      "sampling/sampling_logp_difference/max": 14.508296012878418,
+      "sampling/sampling_logp_difference/mean": 0.018822530284523964,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.653866999935417e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.653866999935417e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15791.0,
+      "completions/mean_length": 5796.5,
+      "completions/mean_terminated_length": 5542.400390625,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "entropy": 0.9230027198791504,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021502040326595306,
+      "learning_rate": 1e-05,
+      "loss": 0.0737,
+      "num_tokens": 168063627.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223351478577,
+      "sampling/importance_sampling_ratio/min": 0.009504453279078007,
+      "sampling/sampling_logp_difference/max": 4.655994892120361,
+      "sampling/sampling_logp_difference/mean": 0.01985779032111168,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 1.0863841453101486e-05,
+      "clip_ratio/high_mean": 2.7159603632753715e-06,
+      "clip_ratio/low_mean": 2.4175752741939505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6891713218901714e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14814.0,
+      "completions/mean_length": 6135.4921875,
+      "completions/mean_terminated_length": 6054.79541015625,
+      "completions/min_length": 1259.0,
+      "completions/min_terminated_length": 1259.0,
+      "entropy": 0.869445689022541,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027786416467279196,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 168867858.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999550580978394,
+      "sampling/importance_sampling_ratio/min": 2.6089865059475414e-05,
+      "sampling/sampling_logp_difference/max": 10.553963661193848,
+      "sampling/sampling_logp_difference/mean": 0.018514130264520645,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 4.36788013757905e-06,
+      "clip_ratio/high_mean": 1.0919700343947625e-06,
+      "clip_ratio/low_mean": 1.993327998661698e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0852980330564606e-06,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15691.0,
+      "completions/mean_length": 6268.2421875,
+      "completions/mean_terminated_length": 6025.46435546875,
+      "completions/min_length": 627.0,
+      "completions/min_terminated_length": 627.0,
+      "entropy": 0.951081782579422,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.0007328780484385788,
+      "learning_rate": 1e-05,
+      "loss": 0.0188,
+      "num_tokens": 169689969.0,
+      "reward": 0.3828125,
+      "reward_std": 0.10994865000247955,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000133514404297,
+      "sampling/importance_sampling_ratio/min": 1.6650999896228313e-05,
+      "sampling/sampling_logp_difference/max": 11.003040313720703,
+      "sampling/sampling_logp_difference/mean": 0.02005261555314064,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 2.131336282218399e-05,
+      "clip_ratio/high_mean": 5.3283407055459975e-06,
+      "clip_ratio/low_mean": 3.5254403428552905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.058274430462916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13861.0,
+      "completions/mean_length": 5440.8984375,
+      "completions/mean_terminated_length": 5354.732421875,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 0.8271932750940323,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034721922129392624,
+      "learning_rate": 1e-05,
+      "loss": -0.0245,
+      "num_tokens": 170409292.0,
+      "reward": 0.53125,
+      "reward_std": 0.30327308177948,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998912811279297,
+      "sampling/importance_sampling_ratio/min": 1.8372484191786498e-05,
+      "sampling/sampling_logp_difference/max": 10.904656410217285,
+      "sampling/sampling_logp_difference/mean": 0.019136395305395126,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 1.2339016848272877e-05,
+      "clip_ratio/high_mean": 4.13687178024702e-06,
+      "clip_ratio/low_mean": 2.156280152121326e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.569967330146028e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15086.0,
+      "completions/mean_length": 6671.046875,
+      "completions/mean_terminated_length": 6594.56689453125,
+      "completions/min_length": 748.0,
+      "completions/min_terminated_length": 748.0,
+      "entropy": 0.9659745842218399,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027575206477195024,
+      "learning_rate": 1e-05,
+      "loss": 0.0286,
+      "num_tokens": 171280714.0,
+      "reward": 0.375,
+      "reward_std": 0.2109457552433014,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411702156067,
+      "sampling/importance_sampling_ratio/min": 1.5700872609158978e-05,
+      "sampling/sampling_logp_difference/max": 11.06179428100586,
+      "sampling/sampling_logp_difference/mean": 0.019089506939053535,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 1.4603458112105727e-05,
+      "clip_ratio/high_mean": 3.650864528026432e-06,
+      "clip_ratio/low_mean": 3.2977761520669446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.662862599185246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15752.0,
+      "completions/mean_length": 7781.5546875,
+      "completions/mean_terminated_length": 7504.05615234375,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 1.1691131889820099,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0012711051385849714,
+      "learning_rate": 1e-05,
+      "loss": 0.0115,
+      "num_tokens": 172302489.0,
+      "reward": 0.109375,
+      "reward_std": 0.1751839816570282,
+      "rewards/accuracy_reward/mean": 0.109375,
+      "rewards/accuracy_reward/std": 0.31333550810813904,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998820424079895,
+      "sampling/importance_sampling_ratio/min": 0.005086081102490425,
+      "sampling/sampling_logp_difference/max": 5.281247615814209,
+      "sampling/sampling_logp_difference/mean": 0.023309212177991867,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 6.842087486802484e-06,
+      "clip_ratio/high_mean": 1.710521871700621e-06,
+      "clip_ratio/low_mean": 4.5269940528669395e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6980462457213434e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14891.0,
+      "completions/mean_length": 6489.96875,
+      "completions/mean_terminated_length": 6332.9208984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9354017227888107,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0016933141741901636,
+      "learning_rate": 1e-05,
+      "loss": 0.0156,
+      "num_tokens": 173149653.0,
+      "reward": 0.484375,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 0.008998609147965908,
+      "sampling/sampling_logp_difference/max": 4.7106852531433105,
+      "sampling/sampling_logp_difference/mean": 0.019165027886629105,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 2.444740721330163e-05,
+      "clip_ratio/high_mean": 6.111851803325408e-06,
+      "clip_ratio/low_mean": 3.0998270403870265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.711012095664046e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14943.0,
+      "completions/max_terminated_length": 14943.0,
+      "completions/mean_length": 6309.75,
+      "completions/mean_terminated_length": 6309.75,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "entropy": 1.012483686208725,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024940327275544405,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 173976797.0,
+      "reward": 0.4375,
+      "reward_std": 0.2790592610836029,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 0.0018720829393714666,
+      "sampling/sampling_logp_difference/max": 6.280703544616699,
+      "sampling/sampling_logp_difference/mean": 0.020797956734895706,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 1.1112337460872368e-05,
+      "clip_ratio/high_mean": 3.5388877677178243e-06,
+      "clip_ratio/low_mean": 1.7024583712554886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.056347148027271e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16362.0,
+      "completions/mean_length": 7574.984375,
+      "completions/mean_terminated_length": 7363.568359375,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9144782647490501,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002748408354818821,
+      "learning_rate": 1e-05,
+      "loss": 0.0588,
+      "num_tokens": 174965259.0,
+      "reward": 0.2734375,
+      "reward_std": 0.25224411487579346,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000108480453491,
+      "sampling/importance_sampling_ratio/min": 0.005681300535798073,
+      "sampling/sampling_logp_difference/max": 5.170575141906738,
+      "sampling/sampling_logp_difference/mean": 0.019229793921113014,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 1.4946090004741563e-05,
+      "clip_ratio/high_mean": 3.736522501185391e-06,
+      "clip_ratio/low_mean": 3.722507381098694e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.096159636901575e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6962.7734375,
+      "completions/mean_terminated_length": 6499.43408203125,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9248140156269073,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020343128126114607,
+      "learning_rate": 1e-05,
+      "loss": 0.0714,
+      "num_tokens": 175876446.0,
+      "reward": 0.421875,
+      "reward_std": 0.3156445026397705,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 0.0001609467581147328,
+      "sampling/sampling_logp_difference/max": 8.734436988830566,
+      "sampling/sampling_logp_difference/mean": 0.01860032044351101,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 4.226114015182247e-06,
+      "clip_ratio/high_mean": 1.0565285037955618e-06,
+      "clip_ratio/low_mean": 3.189400638348161e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.295053488727717e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14978.0,
+      "completions/mean_length": 6422.28125,
+      "completions/mean_terminated_length": 6264.1591796875,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 0.7786787301301956,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029119597747921944,
+      "learning_rate": 1e-05,
+      "loss": 0.1116,
+      "num_tokens": 176717226.0,
+      "reward": 0.578125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918937683105,
+      "sampling/importance_sampling_ratio/min": 0.0006287595024332404,
+      "sampling/sampling_logp_difference/max": 7.371761798858643,
+      "sampling/sampling_logp_difference/mean": 0.01786171644926071,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 5.4112551879370585e-06,
+      "clip_ratio/high_mean": 1.3528137969842646e-06,
+      "clip_ratio/low_mean": 2.103693077515345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2389744572137715e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16030.0,
+      "completions/mean_length": 6662.65625,
+      "completions/mean_terminated_length": 6508.349609375,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9501350447535515,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0027519147843122482,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 177586766.0,
+      "reward": 0.421875,
+      "reward_std": 0.21382881700992584,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000051259994507,
+      "sampling/importance_sampling_ratio/min": 2.507045428501442e-05,
+      "sampling/sampling_logp_difference/max": 10.593820571899414,
+      "sampling/sampling_logp_difference/mean": 0.020679686218500137,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 3.2487785119883483e-06,
+      "clip_ratio/high_mean": 8.121946279970871e-07,
+      "clip_ratio/low_mean": 5.783435085504607e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8646545539886574e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15419.0,
+      "completions/mean_length": 6546.171875,
+      "completions/mean_terminated_length": 6146.259765625,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "entropy": 0.9217342138290405,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017936143558472395,
+      "learning_rate": 1e-05,
+      "loss": 0.0748,
+      "num_tokens": 178444556.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000327825546265,
+      "sampling/importance_sampling_ratio/min": 8.447741129202768e-05,
+      "sampling/sampling_logp_difference/max": 9.379026412963867,
+      "sampling/sampling_logp_difference/mean": 0.019764548167586327,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 2.1980493102091714e-05,
+      "clip_ratio/high_mean": 5.4951232755229285e-06,
+      "clip_ratio/low_mean": 4.3977801396977156e-05,
+      "clip_ratio/low_min": 7.912247156127705e-06,
+      "clip_ratio/region_mean": 4.947292427459615e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15707.0,
+      "completions/max_terminated_length": 15707.0,
+      "completions/mean_length": 6433.9296875,
+      "completions/mean_terminated_length": 6433.9296875,
+      "completions/min_length": 731.0,
+      "completions/min_terminated_length": 731.0,
+      "entropy": 0.9361409991979599,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0031324021983891726,
+      "learning_rate": 1e-05,
+      "loss": 0.0505,
+      "num_tokens": 179288499.0,
+      "reward": 0.453125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999454021453857,
+      "sampling/importance_sampling_ratio/min": 0.00018488657951820642,
+      "sampling/sampling_logp_difference/max": 8.595767974853516,
+      "sampling/sampling_logp_difference/mean": 0.019691072404384613,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 1.299416817346355e-05,
+      "clip_ratio/high_mean": 3.2485420433658874e-06,
+      "clip_ratio/low_mean": 3.756406420052372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.081260635757644e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15787.0,
+      "completions/mean_length": 6037.75,
+      "completions/mean_terminated_length": 5873.52392578125,
+      "completions/min_length": 551.0,
+      "completions/min_terminated_length": 551.0,
+      "entropy": 0.8700985535979271,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0024714914616197348,
+      "learning_rate": 1e-05,
+      "loss": 0.0044,
+      "num_tokens": 180079619.0,
+      "reward": 0.484375,
+      "reward_std": 0.21436560153961182,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999628067016602,
+      "sampling/importance_sampling_ratio/min": 8.4841696661897e-05,
+      "sampling/sampling_logp_difference/max": 9.374723434448242,
+      "sampling/sampling_logp_difference/mean": 0.018519341945648193,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 7.293307589861797e-06,
+      "clip_ratio/high_mean": 1.8233268974654493e-06,
+      "clip_ratio/low_mean": 2.2305866423266707e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.412919320704532e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12264.0,
+      "completions/max_terminated_length": 12264.0,
+      "completions/mean_length": 5305.828125,
+      "completions/mean_terminated_length": 5305.828125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 1.1309608668088913,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003593914210796356,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 180780877.0,
+      "reward": 0.3984375,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011920928955,
+      "sampling/importance_sampling_ratio/min": 0.009941472671926022,
+      "sampling/sampling_logp_difference/max": 4.611040115356445,
+      "sampling/sampling_logp_difference/mean": 0.020471621304750443,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.0163415001661633e-05,
+      "clip_ratio/high_mean": 5.040853750415408e-06,
+      "clip_ratio/low_mean": 4.4980357415624894e-05,
+      "clip_ratio/low_min": 1.0012816346716136e-05,
+      "clip_ratio/region_mean": 5.0021211109196884e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13814.0,
+      "completions/mean_length": 6022.96875,
+      "completions/mean_terminated_length": 5774.30419921875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8560900762677193,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029816587921231985,
+      "learning_rate": 1e-05,
+      "loss": 0.0913,
+      "num_tokens": 181571465.0,
+      "reward": 0.515625,
+      "reward_std": 0.41504397988319397,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 1.5958334188326262e-05,
+      "sampling/sampling_logp_difference/max": 11.04552936553955,
+      "sampling/sampling_logp_difference/mean": 0.0181986466050148,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 1.8430865566188004e-05,
+      "clip_ratio/high_mean": 6.177042905619601e-06,
+      "clip_ratio/low_mean": 4.450247388376738e-05,
+      "clip_ratio/low_min": 4.840271230932558e-06,
+      "clip_ratio/region_mean": 5.067951724413433e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15130.0,
+      "completions/max_terminated_length": 15130.0,
+      "completions/mean_length": 6647.71875,
+      "completions/mean_terminated_length": 6647.71875,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "entropy": 0.9455481320619583,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0031632622703909874,
+      "learning_rate": 1e-05,
+      "loss": 0.1317,
+      "num_tokens": 182440957.0,
+      "reward": 0.3828125,
+      "reward_std": 0.39902517199516296,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000306367874146,
+      "sampling/importance_sampling_ratio/min": 1.4739508515049238e-05,
+      "sampling/sampling_logp_difference/max": 11.124979019165039,
+      "sampling/sampling_logp_difference/mean": 0.01906408555805683,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 2.2937053017813014e-05,
+      "clip_ratio/high_mean": 5.7342632544532535e-06,
+      "clip_ratio/low_mean": 6.042617155799235e-05,
+      "clip_ratio/low_min": 1.1000354334100848e-05,
+      "clip_ratio/region_mean": 6.616043401663774e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15988.0,
+      "completions/mean_length": 6809.1640625,
+      "completions/mean_terminated_length": 6500.29833984375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 1.050546184182167,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00162694591563195,
+      "learning_rate": 1e-05,
+      "loss": 0.0346,
+      "num_tokens": 183332242.0,
+      "reward": 0.421875,
+      "reward_std": 0.33616161346435547,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000290870666504,
+      "sampling/importance_sampling_ratio/min": 4.244970114086755e-06,
+      "sampling/sampling_logp_difference/max": 12.369775772094727,
+      "sampling/sampling_logp_difference/mean": 0.021866722032427788,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 1.4678411844215589e-05,
+      "clip_ratio/high_mean": 3.669602961053897e-06,
+      "clip_ratio/low_mean": 2.4373607971028832e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8043211159456405e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6815.5,
+      "completions/mean_terminated_length": 6506.83837890625,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "entropy": 1.060033954679966,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024887355975806713,
+      "learning_rate": 1e-05,
+      "loss": 0.1059,
+      "num_tokens": 184225138.0,
+      "reward": 0.328125,
+      "reward_std": 0.2869548499584198,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999393820762634,
+      "sampling/importance_sampling_ratio/min": 0.00012930770753882825,
+      "sampling/sampling_logp_difference/max": 8.953315734863281,
+      "sampling/sampling_logp_difference/mean": 0.02019432932138443,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.910891326901037e-06,
+      "clip_ratio/high_mean": 1.9777228317252593e-06,
+      "clip_ratio/low_mean": 3.8802519611635944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.078024221598753e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15838.0,
+      "completions/mean_length": 6928.4453125,
+      "completions/mean_terminated_length": 6623.42724609375,
+      "completions/min_length": 304.0,
+      "completions/min_terminated_length": 304.0,
+      "entropy": 0.9051575735211372,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002783838426694274,
+      "learning_rate": 1e-05,
+      "loss": 0.0624,
+      "num_tokens": 185136323.0,
+      "reward": 0.3359375,
+      "reward_std": 0.25460803508758545,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999524354934692,
+      "sampling/importance_sampling_ratio/min": 1.0146355634788051e-05,
+      "sampling/sampling_logp_difference/max": 11.498395919799805,
+      "sampling/sampling_logp_difference/mean": 0.01905050128698349,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 4.399394583742833e-06,
+      "clip_ratio/high_mean": 1.0998486459357082e-06,
+      "clip_ratio/low_mean": 1.733424267058581e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8434091430208355e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14703.0,
+      "completions/mean_length": 7155.1328125,
+      "completions/mean_terminated_length": 7082.46435546875,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "entropy": 1.0119014978408813,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002105508930981159,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 186071324.0,
+      "reward": 0.328125,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999904990196228,
+      "sampling/importance_sampling_ratio/min": 0.003494206117466092,
+      "sampling/sampling_logp_difference/max": 5.656649112701416,
+      "sampling/sampling_logp_difference/mean": 0.020860780030488968,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 1.0561529961705673e-05,
+      "clip_ratio/high_mean": 3.4390433256703545e-06,
+      "clip_ratio/low_mean": 2.8499469067355676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193851205196552e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16176.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 7463.2421875,
+      "completions/mean_terminated_length": 7463.2421875,
+      "completions/min_length": 698.0,
+      "completions/min_terminated_length": 698.0,
+      "entropy": 0.9983502700924873,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013582308311015368,
+      "learning_rate": 1e-05,
+      "loss": 0.048,
+      "num_tokens": 187045035.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2517249584197998,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999428987503052,
+      "sampling/importance_sampling_ratio/min": 0.000473080639494583,
+      "sampling/sampling_logp_difference/max": 7.65624475479126,
+      "sampling/sampling_logp_difference/mean": 0.021131811663508415,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 8.509013468938065e-06,
+      "clip_ratio/high_mean": 2.127253367234516e-06,
+      "clip_ratio/low_mean": 3.985050443588989e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.197775751890731e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14938.0,
+      "completions/mean_length": 6460.984375,
+      "completions/mean_terminated_length": 6382.8505859375,
+      "completions/min_length": 1747.0,
+      "completions/min_terminated_length": 1747.0,
+      "entropy": 0.7869217246770859,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002681629965081811,
+      "learning_rate": 1e-05,
+      "loss": 0.0987,
+      "num_tokens": 187889609.0,
+      "reward": 0.5234375,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999568462371826,
+      "sampling/importance_sampling_ratio/min": 0.0015037209959700704,
+      "sampling/sampling_logp_difference/max": 6.499812602996826,
+      "sampling/sampling_logp_difference/mean": 0.016937749460339546,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 1.2362176221358823e-05,
+      "clip_ratio/high_mean": 3.0905440553397057e-06,
+      "clip_ratio/low_mean": 5.0333514764133724e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.342405825103924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15893.0,
+      "completions/mean_length": 6241.78125,
+      "completions/mean_terminated_length": 6161.92138671875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.0217387825250626,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021239183843135834,
+      "learning_rate": 1e-05,
+      "loss": 0.0353,
+      "num_tokens": 188706605.0,
+      "reward": 0.2578125,
+      "reward_std": 0.3135277330875397,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999796748161316,
+      "sampling/importance_sampling_ratio/min": 0.004853047896176577,
+      "sampling/sampling_logp_difference/max": 5.328148365020752,
+      "sampling/sampling_logp_difference/mean": 0.02103862166404724,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 6.725130333506968e-06,
+      "clip_ratio/high_mean": 1.681282583376742e-06,
+      "clip_ratio/low_mean": 3.437372129155847e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.605500387493521e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15332.0,
+      "completions/mean_length": 5638.1328125,
+      "completions/mean_terminated_length": 5553.51953125,
+      "completions/min_length": 66.0,
+      "completions/min_terminated_length": 66.0,
+      "entropy": 0.7844365313649178,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023868419229984283,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 189446294.0,
+      "reward": 0.515625,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000369548797607,
+      "sampling/importance_sampling_ratio/min": 0.0008047468145377934,
+      "sampling/sampling_logp_difference/max": 7.124982833862305,
+      "sampling/sampling_logp_difference/mean": 0.017401430755853653,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 2.887730215661577e-05,
+      "clip_ratio/high_mean": 7.219325539153942e-06,
+      "clip_ratio/low_mean": 2.826443028425274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.548375502759882e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16196.0,
+      "completions/mean_length": 6374.8046875,
+      "completions/mean_terminated_length": 6215.9287109375,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9472770467400551,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027549315709620714,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 190281461.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3167053163051605,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998682737350464,
+      "sampling/importance_sampling_ratio/min": 7.100860239006579e-05,
+      "sampling/sampling_logp_difference/max": 9.552709579467773,
+      "sampling/sampling_logp_difference/mean": 0.020243138074874878,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 1.586787766427733e-05,
+      "clip_ratio/high_mean": 3.9669694160693325e-06,
+      "clip_ratio/low_mean": 2.978218674343225e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.374915604581474e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15097.0,
+      "completions/mean_length": 6654.21875,
+      "completions/mean_terminated_length": 6499.88134765625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "entropy": 1.0028243213891983,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0013344973558560014,
+      "learning_rate": 1e-05,
+      "loss": 0.0184,
+      "num_tokens": 191156249.0,
+      "reward": 0.359375,
+      "reward_std": 0.22832971811294556,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 0.0021875568199902773,
+      "sampling/sampling_logp_difference/max": 6.124969959259033,
+      "sampling/sampling_logp_difference/mean": 0.020470600575208664,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 1.681529829511419e-05,
+      "clip_ratio/high_mean": 4.9954849146160996e-06,
+      "clip_ratio/low_mean": 2.040554932136729e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5401033553862362e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16172.0,
+      "completions/mean_length": 6767.7890625,
+      "completions/mean_terminated_length": 6537.00048828125,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "entropy": 0.9059296399354935,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016136945923790336,
+      "learning_rate": 1e-05,
+      "loss": 0.0816,
+      "num_tokens": 192040526.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999668598175049,
+      "sampling/importance_sampling_ratio/min": 1.2452921509975567e-05,
+      "sampling/sampling_logp_difference/max": 11.29355525970459,
+      "sampling/sampling_logp_difference/mean": 0.020058143883943558,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9821966563758906e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9821966563758906e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16275.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 6767.4921875,
+      "completions/mean_terminated_length": 6767.4921875,
+      "completions/min_length": 998.0,
+      "completions/min_terminated_length": 998.0,
+      "entropy": 1.0446822568774223,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002869367366656661,
+      "learning_rate": 1e-05,
+      "loss": 0.0212,
+      "num_tokens": 192926469.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2517249882221222,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586343765259,
+      "sampling/importance_sampling_ratio/min": 1.9328599591972306e-05,
+      "sampling/sampling_logp_difference/max": 10.853924751281738,
+      "sampling/sampling_logp_difference/mean": 0.021512050181627274,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 3.44581130775623e-05,
+      "clip_ratio/high_mean": 1.3001711295146379e-05,
+      "clip_ratio/low_mean": 3.6407937841431703e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.940964981869911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16261.0,
+      "completions/max_terminated_length": 16261.0,
+      "completions/mean_length": 5738.484375,
+      "completions/mean_terminated_length": 5738.484375,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "entropy": 0.8617956340312958,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002177527640014887,
+      "learning_rate": 1e-05,
+      "loss": -0.0189,
+      "num_tokens": 193678859.0,
+      "reward": 0.5546875,
+      "reward_std": 0.33220988512039185,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570846557617,
+      "sampling/importance_sampling_ratio/min": 0.0008533780346624553,
+      "sampling/sampling_logp_difference/max": 7.06630802154541,
+      "sampling/sampling_logp_difference/mean": 0.018141131848096848,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 3.861003733618418e-06,
+      "clip_ratio/high_mean": 9.652509334046044e-07,
+      "clip_ratio/low_mean": 2.7767115511778684e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8732366558870126e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15595.0,
+      "completions/mean_length": 6382.90625,
+      "completions/mean_terminated_length": 5976.357421875,
+      "completions/min_length": 464.0,
+      "completions/min_terminated_length": 464.0,
+      "entropy": 0.8692388981580734,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004127771593630314,
+      "learning_rate": 1e-05,
+      "loss": 0.0572,
+      "num_tokens": 194511847.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2767002582550049,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998810291290283,
+      "sampling/importance_sampling_ratio/min": 5.4239239943854045e-06,
+      "sampling/sampling_logp_difference/max": 12.124691009521484,
+      "sampling/sampling_logp_difference/mean": 0.018376430496573448,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 9.728395525598899e-06,
+      "clip_ratio/high_mean": 2.4320988813997246e-06,
+      "clip_ratio/low_mean": 5.3631663831765763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.606376271316549e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14504.0,
+      "completions/max_terminated_length": 14504.0,
+      "completions/mean_length": 5776.15625,
+      "completions/mean_terminated_length": 5776.15625,
+      "completions/min_length": 1018.0,
+      "completions/min_terminated_length": 1018.0,
+      "entropy": 1.1195004731416702,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00263008801266551,
+      "learning_rate": 1e-05,
+      "loss": 0.0687,
+      "num_tokens": 195270051.0,
+      "reward": 0.421875,
+      "reward_std": 0.3618982434272766,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999971866607666,
+      "sampling/importance_sampling_ratio/min": 0.005209421273320913,
+      "sampling/sampling_logp_difference/max": 5.257286548614502,
+      "sampling/sampling_logp_difference/mean": 0.019923292100429535,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.2701100786216557e-05,
+      "clip_ratio/high_mean": 3.1752751965541393e-06,
+      "clip_ratio/low_mean": 4.2162768181697174e-05,
+      "clip_ratio/low_min": 3.873926743835909e-06,
+      "clip_ratio/region_mean": 4.5338043378251314e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 7411.421875,
+      "completions/mean_terminated_length": 7196.08056640625,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "entropy": 0.9801053553819656,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002642859937623143,
+      "learning_rate": 1e-05,
+      "loss": 0.07,
+      "num_tokens": 196240913.0,
+      "reward": 0.390625,
+      "reward_std": 0.27328529953956604,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999198913574219,
+      "sampling/importance_sampling_ratio/min": 0.00017500204558018595,
+      "sampling/sampling_logp_difference/max": 8.650712966918945,
+      "sampling/sampling_logp_difference/mean": 0.021511007100343704,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 1.5122936929401476e-05,
+      "clip_ratio/high_mean": 3.780734232350369e-06,
+      "clip_ratio/low_mean": 6.367217611114029e-05,
+      "clip_ratio/low_min": 4.8010447244450916e-06,
+      "clip_ratio/region_mean": 6.745291057086433e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16127.0,
+      "completions/mean_length": 7944.65625,
+      "completions/mean_terminated_length": 7742.1123046875,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "entropy": 1.0132562816143036,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002439325675368309,
+      "learning_rate": 1e-05,
+      "loss": 0.0564,
+      "num_tokens": 197278517.0,
+      "reward": 0.34375,
+      "reward_std": 0.3161812424659729,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999248385429382,
+      "sampling/importance_sampling_ratio/min": 1.0140610356756952e-05,
+      "sampling/sampling_logp_difference/max": 11.49896240234375,
+      "sampling/sampling_logp_difference/mean": 0.02124868705868721,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 2.6017536356448545e-05,
+      "clip_ratio/high_mean": 6.504384089112136e-06,
+      "clip_ratio/low_mean": 3.7791321346958284e-05,
+      "clip_ratio/low_min": 3.2110563097376144e-06,
+      "clip_ratio/region_mean": 4.429570503816649e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 7550.0,
+      "completions/mean_terminated_length": 7409.7783203125,
+      "completions/min_length": 1469.0,
+      "completions/min_terminated_length": 1469.0,
+      "entropy": 1.0384011715650558,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014879995724186301,
+      "learning_rate": 1e-05,
+      "loss": 0.0338,
+      "num_tokens": 198265589.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24040167033672333,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999468922615051,
+      "sampling/importance_sampling_ratio/min": 8.418659126618877e-05,
+      "sampling/sampling_logp_difference/max": 9.382474899291992,
+      "sampling/sampling_logp_difference/mean": 0.021503347903490067,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.3615457191917812e-05,
+      "clip_ratio/high_mean": 4.491880531531933e-06,
+      "clip_ratio/low_mean": 3.916533574965797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.365721684962409e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16221.0,
+      "completions/mean_length": 8140.9140625,
+      "completions/mean_terminated_length": 7517.48779296875,
+      "completions/min_length": 837.0,
+      "completions/min_terminated_length": 837.0,
+      "entropy": 0.8718572407960892,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002340668346732855,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 199324938.0,
+      "reward": 0.453125,
+      "reward_std": 0.35824596881866455,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999454021453857,
+      "sampling/importance_sampling_ratio/min": 0.002325017238035798,
+      "sampling/sampling_logp_difference/max": 6.064027786254883,
+      "sampling/sampling_logp_difference/mean": 0.019466478377580643,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 2.2175697040438536e-05,
+      "clip_ratio/high_mean": 5.543924260109634e-06,
+      "clip_ratio/low_mean": 4.1318608055007644e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.686253225827386e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16263.0,
+      "completions/mean_length": 6630.96875,
+      "completions/mean_terminated_length": 6396.896484375,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "entropy": 0.7798146530985832,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001989356242120266,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 200189902.0,
+      "reward": 0.5625,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474883079529,
+      "sampling/importance_sampling_ratio/min": 0.0003315774374641478,
+      "sampling/sampling_logp_difference/max": 8.011649131774902,
+      "sampling/sampling_logp_difference/mean": 0.01849902793765068,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 3.325706302348408e-06,
+      "clip_ratio/high_mean": 8.31426575587102e-07,
+      "clip_ratio/low_mean": 2.0285911205064622e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.111733795118198e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15357.0,
+      "completions/max_terminated_length": 15357.0,
+      "completions/mean_length": 6582.203125,
+      "completions/mean_terminated_length": 6582.203125,
+      "completions/min_length": 593.0,
+      "completions/min_terminated_length": 593.0,
+      "entropy": 1.0181676000356674,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002594445599243045,
+      "learning_rate": 1e-05,
+      "loss": 0.0232,
+      "num_tokens": 201052832.0,
+      "reward": 0.34375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999495148658752,
+      "sampling/importance_sampling_ratio/min": 0.0003853558446280658,
+      "sampling/sampling_logp_difference/max": 7.8613433837890625,
+      "sampling/sampling_logp_difference/mean": 0.021598614752292633,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 2.2044430352252675e-05,
+      "clip_ratio/high_mean": 5.511107588063169e-06,
+      "clip_ratio/low_mean": 3.4155824209847196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96669319115972e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14540.0,
+      "completions/max_terminated_length": 14540.0,
+      "completions/mean_length": 6145.1796875,
+      "completions/mean_terminated_length": 6145.1796875,
+      "completions/min_length": 1098.0,
+      "completions/min_terminated_length": 1098.0,
+      "entropy": 0.9084350541234016,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003104996867477894,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 201858047.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33220985531806946,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011682510376,
+      "sampling/importance_sampling_ratio/min": 0.007650630082935095,
+      "sampling/sampling_logp_difference/max": 4.87296724319458,
+      "sampling/sampling_logp_difference/mean": 0.018979094922542572,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 2.9959978519400465e-05,
+      "clip_ratio/high_mean": 7.489994629850116e-06,
+      "clip_ratio/low_mean": 3.5255963325653283e-05,
+      "clip_ratio/low_min": 2.973075879708631e-06,
+      "clip_ratio/region_mean": 4.274595892184152e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15745.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 7259.953125,
+      "completions/mean_terminated_length": 7259.953125,
+      "completions/min_length": 960.0,
+      "completions/min_terminated_length": 960.0,
+      "entropy": 0.9823614731431007,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003212577663362026,
+      "learning_rate": 1e-05,
+      "loss": 0.0133,
+      "num_tokens": 202807673.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999860405921936,
+      "sampling/importance_sampling_ratio/min": 0.000536504783667624,
+      "sampling/sampling_logp_difference/max": 7.530435085296631,
+      "sampling/sampling_logp_difference/mean": 0.021432969719171524,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 3.273996276220714e-05,
+      "clip_ratio/high_mean": 9.095591565255745e-06,
+      "clip_ratio/low_mean": 2.9539680099333054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8635271948805894e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16369.0,
+      "completions/mean_length": 7258.71875,
+      "completions/mean_terminated_length": 7113.87353515625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8823810070753098,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001418307889252901,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 203757333.0,
+      "reward": 0.40625,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884963035583,
+      "sampling/importance_sampling_ratio/min": 0.0006408974295482039,
+      "sampling/sampling_logp_difference/max": 7.3526411056518555,
+      "sampling/sampling_logp_difference/mean": 0.019296500831842422,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 1.544119368190877e-05,
+      "clip_ratio/high_mean": 3.860298420477193e-06,
+      "clip_ratio/low_mean": 3.755458698151415e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.141488631148604e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7011.40625,
+      "completions/mean_terminated_length": 6386.56689453125,
+      "completions/min_length": 685.0,
+      "completions/min_terminated_length": 685.0,
+      "entropy": 0.8057166337966919,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001652427832596004,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 204675065.0,
+      "reward": 0.46875,
+      "reward_std": 0.24146251380443573,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918937683105,
+      "sampling/importance_sampling_ratio/min": 0.015319154597818851,
+      "sampling/sampling_logp_difference/max": 4.178651332855225,
+      "sampling/sampling_logp_difference/mean": 0.018787402659654617,
+      "step": 256
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 204675065,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-256/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-256/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# Diagnostics switch: the helpers in this file print extra information when it is truthy.
+debug = 0
+
+# load to cpu
+# Passed as ``map_location`` to ``torch.load`` by parse_model_states and parse_optim_states.
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()  # filled in order: buffers, frozen params (optional), trainable params
+
+    # buffers: copied as-is from the first entry of zero_model_states
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: alias pair[0] to the entry already stored under pair[1]
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert a state_dict of lazy (GatheredTensor-like) entries to torch tensors; entries that are the same object stay shared.
+    """
+    torch_state_dict = {}
+    converted_tensors = {}  # id(source tensor) -> first name it was converted under
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors: reuse the already converted object
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)  # placeholder with matching shape/dtype only
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor with ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check: fail before any conversion work if an optional package is missing
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict (lazy mode: entries are converted shard by shard below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty placeholder tensors
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard (drop both the lazy entry and the converted tensor)
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded (maps every tensor name to the shard file that holds it)
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)  # strict=False: mismatched keys do not raise here
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/README.md b/dapo_lora_plus_20251202_001141/checkpoint-320/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-320/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-320/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-320/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-320/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-320/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/latest b/dapo_lora_plus_20251202_001141/checkpoint-320/latest
new file mode 100644
index 0000000000000000000000000000000000000000..9d535587efdab3121736d8095481e4143f000213
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-320/latest
@@ -0,0 +1 @@
+global_step320
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-320/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-320/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/tokenizer_config.json b/dapo_lora_plus_20251202_001141/checkpoint-320/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-320/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-320/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8eaf34db285507204c1e5ddd562e56437a34ba41
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-320/trainer_state.json
@@ -0,0 +1,9954 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.29438822447102114,
+  "eval_steps": 500,
+  "global_step": 320,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025745572056621313,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 5.499582130141789e-06,
+      "clip_ratio/high_mean": 1.3748955325354473e-06,
+      "clip_ratio/low_mean": 2.871888784738985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009378326623846e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 4767.1875,
+      "completions/mean_terminated_length": 4767.1875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.088237851858139,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002068034838885069,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 1425798.0,
+      "reward": 0.3046875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 0.01811397261917591,
+      "sampling/sampling_logp_difference/max": 4.011071681976318,
+      "sampling/sampling_logp_difference/mean": 0.01877593621611595,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.459846724103045e-05,
+      "clip_ratio/low_min": 3.4060874440910993e-06,
+      "clip_ratio/region_mean": 4.459846724103045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 6586.359375,
+      "completions/mean_terminated_length": 6351.21630859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0497623533010483,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001971944235265255,
+      "learning_rate": 1e-05,
+      "loss": 0.0199,
+      "num_tokens": 2287420.0,
+      "reward": 0.28125,
+      "reward_std": 0.29143062233924866,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999316334724426,
+      "sampling/importance_sampling_ratio/min": 5.356698966352269e-05,
+      "sampling/sampling_logp_difference/max": 9.834577560424805,
+      "sampling/sampling_logp_difference/mean": 0.02137824520468712,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.7640652004047297e-05,
+      "clip_ratio/high_mean": 5.48578327652649e-06,
+      "clip_ratio/low_mean": 3.218628648937738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.767206976590387e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14690.0,
+      "completions/max_terminated_length": 14690.0,
+      "completions/mean_length": 5448.0234375,
+      "completions/mean_terminated_length": 5448.0234375,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 1.1134418621659279,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016465173102915287,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 3009167.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27958330512046814,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 7.889385415182915e-06,
+      "sampling/sampling_logp_difference/max": 11.749992370605469,
+      "sampling/sampling_logp_difference/mean": 0.020580951124429703,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 1.3439519989333348e-05,
+      "clip_ratio/high_mean": 3.359879997333337e-06,
+      "clip_ratio/low_mean": 2.8849915906903334e-05,
+      "clip_ratio/low_min": 8.467687621305231e-06,
+      "clip_ratio/region_mean": 3.220979442630778e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13420.0,
+      "completions/mean_length": 5436.8671875,
+      "completions/mean_terminated_length": 5350.66943359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 1.1473859176039696,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023770295083522797,
+      "learning_rate": 1e-05,
+      "loss": 0.0153,
+      "num_tokens": 3725654.0,
+      "reward": 0.2734375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99991774559021,
+      "sampling/importance_sampling_ratio/min": 0.0011146117467433214,
+      "sampling/sampling_logp_difference/max": 6.799249172210693,
+      "sampling/sampling_logp_difference/mean": 0.020377254113554955,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 4.652201369026443e-06,
+      "clip_ratio/high_mean": 1.1630503422566107e-06,
+      "clip_ratio/low_mean": 2.8399212624208303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9562263534899103e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14440.0,
+      "completions/max_terminated_length": 14440.0,
+      "completions/mean_length": 4697.5390625,
+      "completions/mean_terminated_length": 4697.5390625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.0097229778766632,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003342699259519577,
+      "learning_rate": 1e-05,
+      "loss": 0.0326,
+      "num_tokens": 4345547.0,
+      "reward": 0.390625,
+      "reward_std": 0.34480881690979004,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914765357971,
+      "sampling/importance_sampling_ratio/min": 0.002385853324085474,
+      "sampling/sampling_logp_difference/max": 6.038198471069336,
+      "sampling/sampling_logp_difference/mean": 0.0185473021119833,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 9.362594937556423e-06,
+      "clip_ratio/high_mean": 2.340648734389106e-06,
+      "clip_ratio/low_mean": 6.054362825125281e-05,
+      "clip_ratio/low_min": 7.427356649714056e-06,
+      "clip_ratio/region_mean": 6.288427744038927e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14652.0,
+      "completions/mean_length": 6218.2109375,
+      "completions/mean_terminated_length": 5890.2822265625,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.0579778030514717,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002073560608550906,
+      "learning_rate": 1e-05,
+      "loss": 0.0201,
+      "num_tokens": 5160646.0,
+      "reward": 0.2109375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 0.00044544730917550623,
+      "sampling/sampling_logp_difference/max": 7.716431617736816,
+      "sampling/sampling_logp_difference/mean": 0.020321575924754143,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 1.1064067621191498e-05,
+      "clip_ratio/high_mean": 2.7660169052978745e-06,
+      "clip_ratio/low_mean": 2.2175867059104348e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4941883737028547e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13637.0,
+      "completions/mean_length": 5127.8359375,
+      "completions/mean_terminated_length": 5039.20458984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.0472618415951729,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032994600478559732,
+      "learning_rate": 1e-05,
+      "loss": 0.0751,
+      "num_tokens": 5836289.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999483227729797,
+      "sampling/importance_sampling_ratio/min": 0.0013780994340777397,
+      "sampling/sampling_logp_difference/max": 6.587049961090088,
+      "sampling/sampling_logp_difference/mean": 0.01940803974866867,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 1.2357884770608507e-05,
+      "clip_ratio/high_mean": 3.0894711926521268e-06,
+      "clip_ratio/low_mean": 3.000627111759968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.309574231025181e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15916.0,
+      "completions/mean_length": 4516.890625,
+      "completions/mean_terminated_length": 4423.44873046875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "entropy": 0.911251038312912,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003016560571268201,
+      "learning_rate": 1e-05,
+      "loss": 0.1006,
+      "num_tokens": 6433171.0,
+      "reward": 0.390625,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999179840087891,
+      "sampling/importance_sampling_ratio/min": 0.005480794236063957,
+      "sampling/sampling_logp_difference/max": 5.206505298614502,
+      "sampling/sampling_logp_difference/mean": 0.017437148839235306,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 4.6329013457580004e-05,
+      "clip_ratio/high_mean": 1.1582253364395001e-05,
+      "clip_ratio/low_mean": 7.069455705277505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.227681109929108e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13970.0,
+      "completions/mean_length": 4961.453125,
+      "completions/mean_terminated_length": 4687.31201171875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "entropy": 0.6808596402406693,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0035386616364121437,
+      "learning_rate": 1e-05,
+      "loss": 0.0596,
+      "num_tokens": 7085389.0,
+      "reward": 0.5625,
+      "reward_std": 0.3816363215446472,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0002734088629949838,
+      "sampling/sampling_logp_difference/max": 8.20454216003418,
+      "sampling/sampling_logp_difference/mean": 0.01566406339406967,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 2.43190661421977e-05,
+      "clip_ratio/high_mean": 6.079766535549425e-06,
+      "clip_ratio/low_mean": 2.2395396172214532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8475162707763957e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14776.0,
+      "completions/mean_length": 4429.40625,
+      "completions/mean_terminated_length": 4335.275390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9181502386927605,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022535293828696012,
+      "learning_rate": 1e-05,
+      "loss": 0.0031,
+      "num_tokens": 7672185.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20357418060302734,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998801946640015,
+      "sampling/importance_sampling_ratio/min": 5.315856554943821e-08,
+      "sampling/sampling_logp_difference/max": 16.74998664855957,
+      "sampling/sampling_logp_difference/mean": 0.018429335206747055,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 1.0117325928149512e-05,
+      "clip_ratio/high_mean": 2.529331482037378e-06,
+      "clip_ratio/low_mean": 1.1982813475697185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.45121450714214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5282.6796875,
+      "completions/mean_terminated_length": 5106.46875,
+      "completions/min_length": 323.0,
+      "completions/min_terminated_length": 323.0,
+      "entropy": 1.113751620054245,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013591813622042537,
+      "learning_rate": 1e-05,
+      "loss": 0.0971,
+      "num_tokens": 8369000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3029736578464508,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 3.970265970565379e-05,
+      "sampling/sampling_logp_difference/max": 10.134092330932617,
+      "sampling/sampling_logp_difference/mean": 0.020221836864948273,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 5.411958227341529e-06,
+      "clip_ratio/high_mean": 1.3529895568353822e-06,
+      "clip_ratio/low_mean": 2.5284593846208736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6637583516730956e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15925.0,
+      "completions/mean_length": 6970.421875,
+      "completions/mean_terminated_length": 6744.49609375,
+      "completions/min_length": 53.0,
+      "completions/min_terminated_length": 53.0,
+      "entropy": 1.1721933633089066,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024079051800072193,
+      "learning_rate": 1e-05,
+      "loss": 0.0713,
+      "num_tokens": 9283182.0,
+      "reward": 0.171875,
+      "reward_std": 0.17965975403785706,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999163746833801,
+      "sampling/importance_sampling_ratio/min": 0.0008915197686292231,
+      "sampling/sampling_logp_difference/max": 7.0225830078125,
+      "sampling/sampling_logp_difference/mean": 0.021462474018335342,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 2.0661535927501973e-05,
+      "clip_ratio/high_mean": 5.165383981875493e-06,
+      "clip_ratio/low_mean": 2.4304956298237812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.947033948430544e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14658.0,
+      "completions/max_terminated_length": 14658.0,
+      "completions/mean_length": 4886.875,
+      "completions/mean_terminated_length": 4886.875,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 1.0108910650014877,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002063734456896782,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 9928446.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2409384697675705,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000026226043701,
+      "sampling/importance_sampling_ratio/min": 0.0003672837920021266,
+      "sampling/sampling_logp_difference/max": 7.9093756675720215,
+      "sampling/sampling_logp_difference/mean": 0.01918785460293293,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.4761846993424115e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4761846993424115e-06,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12992.0,
+      "completions/max_terminated_length": 12992.0,
+      "completions/mean_length": 4824.0078125,
+      "completions/mean_terminated_length": 4824.0078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 1.1070282831788063,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002424790756776929,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 10566415.0,
+      "reward": 0.28125,
+      "reward_std": 0.23698672652244568,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0011708867968991399,
+      "sampling/sampling_logp_difference/max": 6.749993801116943,
+      "sampling/sampling_logp_difference/mean": 0.02069389820098877,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 3.5075904634140898e-06,
+      "clip_ratio/high_mean": 8.768976158535224e-07,
+      "clip_ratio/low_mean": 2.2676964135825983e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3553861751679506e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12685.0,
+      "completions/mean_length": 5449.4140625,
+      "completions/mean_terminated_length": 5363.31494140625,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9817888736724854,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021046048495918512,
+      "learning_rate": 1e-05,
+      "loss": 0.0252,
+      "num_tokens": 11281908.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805688858032,
+      "sampling/importance_sampling_ratio/min": 0.013273254036903381,
+      "sampling/sampling_logp_difference/max": 4.322004318237305,
+      "sampling/sampling_logp_difference/mean": 0.019556276500225067,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 1.624216065465589e-05,
+      "clip_ratio/high_mean": 4.060540163663973e-06,
+      "clip_ratio/low_mean": 5.4349347919924185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.840988796990132e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14133.0,
+      "completions/max_terminated_length": 14133.0,
+      "completions/mean_length": 5343.25,
+      "completions/mean_terminated_length": 5343.25,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 1.04741720110178,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035894038155674934,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 11987692.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998996257781982,
+      "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05,
+      "sampling/sampling_logp_difference/max": 10.749964714050293,
+      "sampling/sampling_logp_difference/mean": 0.020530637353658676,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.272115029380075e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.272115029380075e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15138.0,
+      "completions/mean_length": 6301.9375,
+      "completions/mean_terminated_length": 5806.09814453125,
+      "completions/min_length": 72.0,
+      "completions/min_terminated_length": 72.0,
+      "entropy": 0.8892941772937775,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032246762420982122,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 12814244.0,
+      "reward": 0.3125,
+      "reward_std": 0.3606000542640686,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999184608459473,
+      "sampling/importance_sampling_ratio/min": 0.021351110190153122,
+      "sampling/sampling_logp_difference/max": 3.846651554107666,
+      "sampling/sampling_logp_difference/mean": 0.017541853711009026,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 9.956602298188955e-06,
+      "clip_ratio/high_mean": 2.4891505745472386e-06,
+      "clip_ratio/low_mean": 2.772165316855535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0210803743102588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16213.0,
+      "completions/max_terminated_length": 16213.0,
+      "completions/mean_length": 5297.46875,
+      "completions/mean_terminated_length": 5297.46875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8097029253840446,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023969109170138836,
+      "learning_rate": 1e-05,
+      "loss": -0.0153,
+      "num_tokens": 13512520.0,
+      "reward": 0.359375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999222159385681,
+      "sampling/importance_sampling_ratio/min": 0.005766105372458696,
+      "sampling/sampling_logp_difference/max": 5.155758380889893,
+      "sampling/sampling_logp_difference/mean": 0.017464376986026764,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 1.0098337497765897e-05,
+      "clip_ratio/high_mean": 2.524584374441474e-06,
+      "clip_ratio/low_mean": 3.173396362399217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.425854845318099e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14655.0,
+      "completions/mean_length": 4890.34375,
+      "completions/mean_terminated_length": 4799.84228515625,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.9267145916819572,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002759338356554508,
+      "learning_rate": 1e-05,
+      "loss": -0.0014,
+      "num_tokens": 14155556.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.008491010405123234,
+      "sampling/sampling_logp_difference/max": 4.768747329711914,
+      "sampling/sampling_logp_difference/mean": 0.018839433789253235,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 7.532389190600952e-06,
+      "clip_ratio/high_mean": 1.883097297650238e-06,
+      "clip_ratio/low_mean": 1.9051809317716106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0934906729053182e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16296.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 4609.40625,
+      "completions/mean_terminated_length": 4609.40625,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 1.171089917421341,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021055075339972973,
+      "learning_rate": 1e-05,
+      "loss": -0.0051,
+      "num_tokens": 14765328.0,
+      "reward": 0.2421875,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741911888123,
+      "sampling/importance_sampling_ratio/min": 5.368983693188056e-07,
+      "sampling/sampling_logp_difference/max": 14.437457084655762,
+      "sampling/sampling_logp_difference/mean": 0.020226795226335526,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.7169573766295798e-05,
+      "clip_ratio/high_mean": 4.2923934415739495e-06,
+      "clip_ratio/low_mean": 5.869748633813288e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.0162142189074075e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14299.0,
+      "completions/mean_length": 5099.0390625,
+      "completions/mean_terminated_length": 5010.18115234375,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.005959376692772,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027595218271017075,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 15438549.0,
+      "reward": 0.296875,
+      "reward_std": 0.20069602131843567,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887347221375,
+      "sampling/importance_sampling_ratio/min": 0.00013984869292471558,
+      "sampling/sampling_logp_difference/max": 8.87494945526123,
+      "sampling/sampling_logp_difference/mean": 0.01902824640274048,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 5.162942670722259e-06,
+      "clip_ratio/high_mean": 1.2907356676805648e-06,
+      "clip_ratio/low_mean": 3.6872071063953626e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.816280593582633e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7138.0390625,
+      "completions/mean_terminated_length": 6839.7822265625,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.0403362140059471,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002748022088780999,
+      "learning_rate": 1e-05,
+      "loss": 0.0647,
+      "num_tokens": 16373898.0,
+      "reward": 0.296875,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999048709869385,
+      "sampling/importance_sampling_ratio/min": 0.0003802926803473383,
+      "sampling/sampling_logp_difference/max": 7.874569416046143,
+      "sampling/sampling_logp_difference/mean": 0.020853528752923012,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.6506045439164154e-05,
+      "clip_ratio/low_min": 5.709326615033206e-06,
+      "clip_ratio/region_mean": 5.6506045439164154e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14543.0,
+      "completions/mean_length": 5420.515625,
+      "completions/mean_terminated_length": 5334.18896484375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 1.1339883506298065,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029502976685762405,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 17088156.0,
+      "reward": 0.1953125,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 9.70982582657598e-05,
+      "sampling/sampling_logp_difference/max": 9.239787101745605,
+      "sampling/sampling_logp_difference/mean": 0.0199423898011446,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 5.619998319161823e-06,
+      "clip_ratio/high_mean": 1.4049995797904558e-06,
+      "clip_ratio/low_mean": 6.439320418394345e-05,
+      "clip_ratio/low_min": 4.70632539872895e-06,
+      "clip_ratio/region_mean": 6.57982034226734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14636.0,
+      "completions/mean_length": 5116.3046875,
+      "completions/mean_terminated_length": 4845.88037109375,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.9503882825374603,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004891107324510813,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 17766619.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0010618992382660508,
+      "sampling/sampling_logp_difference/max": 6.847696304321289,
+      "sampling/sampling_logp_difference/mean": 0.01914183795452118,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.839018643247982e-05,
+      "clip_ratio/low_min": 4.115091087442124e-06,
+      "clip_ratio/region_mean": 3.839018643247982e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14634.0,
+      "completions/mean_length": 5061.8671875,
+      "completions/mean_terminated_length": 4972.71630859375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.0540335327386856,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030373274348676205,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 18432938.0,
+      "reward": 0.34375,
+      "reward_std": 0.28118088841438293,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999624490737915,
+      "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06,
+      "sampling/sampling_logp_difference/max": 13.272432327270508,
+      "sampling/sampling_logp_difference/mean": 0.019548218697309494,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.4656657867817557e-05,
+      "clip_ratio/high_mean": 4.665093399580655e-06,
+      "clip_ratio/low_mean": 3.751162262233265e-05,
+      "clip_ratio/low_min": 4.413062470121076e-06,
+      "clip_ratio/region_mean": 4.2176716192443564e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15782.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6349.9765625,
+      "completions/mean_terminated_length": 6349.9765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0268081277608871,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017623496241867542,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 19264743.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 6.870362267363816e-05,
+      "sampling/sampling_logp_difference/max": 9.585708618164062,
+      "sampling/sampling_logp_difference/mean": 0.019106190651655197,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 9.221375876222737e-06,
+      "clip_ratio/high_mean": 2.3053439690556843e-06,
+      "clip_ratio/low_mean": 3.09787185415189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.328406273794826e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 5815.484375,
+      "completions/mean_terminated_length": 5561.84033203125,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 1.0389493256807327,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003111837198957801,
+      "learning_rate": 1e-05,
+      "loss": -0.0162,
+      "num_tokens": 20030109.0,
+      "reward": 0.34375,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000298023223877,
+      "sampling/importance_sampling_ratio/min": 0.02987043187022209,
+      "sampling/sampling_logp_difference/max": 3.5108861923217773,
+      "sampling/sampling_logp_difference/mean": 0.020060991868376732,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 6.7810142354574054e-06,
+      "clip_ratio/high_mean": 1.6952535588643514e-06,
+      "clip_ratio/low_mean": 4.474762545214617e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644287901101052e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 5157.1484375,
+      "completions/mean_terminated_length": 5068.748046875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.0510126948356628,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003041633637621999,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 20710904.0,
+      "reward": 0.3125,
+      "reward_std": 0.35612428188323975,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 0.04357198625802994,
+      "sampling/sampling_logp_difference/max": 3.133340835571289,
+      "sampling/sampling_logp_difference/mean": 0.019007597118616104,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0962848566341563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0962848566341563e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15333.0,
+      "completions/max_terminated_length": 15333.0,
+      "completions/mean_length": 4446.3828125,
+      "completions/mean_terminated_length": 4446.3828125,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.053279548883438,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022369560319930315,
+      "learning_rate": 1e-05,
+      "loss": -0.001,
+      "num_tokens": 21298497.0,
+      "reward": 0.390625,
+      "reward_std": 0.24169495701789856,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998750686645508,
+      "sampling/importance_sampling_ratio/min": 0.006704842206090689,
+      "sampling/sampling_logp_difference/max": 5.00492525100708,
+      "sampling/sampling_logp_difference/mean": 0.01947362720966339,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8460265411922592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8460265411922592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15386.0,
+      "completions/mean_length": 6294.1484375,
+      "completions/mean_terminated_length": 6133.9921875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.2036212533712387,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021383841522037983,
+      "learning_rate": 1e-05,
+      "loss": 0.033,
+      "num_tokens": 22124812.0,
+      "reward": 0.171875,
+      "reward_std": 0.20752590894699097,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858736991882,
+      "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07,
+      "sampling/sampling_logp_difference/max": 14.742476463317871,
+      "sampling/sampling_logp_difference/mean": 0.022367021068930626,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 1.73864664247958e-05,
+      "clip_ratio/high_mean": 4.34661660619895e-06,
+      "clip_ratio/low_mean": 3.19569651310303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.630358173722925e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14893.0,
+      "completions/mean_length": 6011.4921875,
+      "completions/mean_terminated_length": 5929.81884765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.123318687081337,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00126531848218292,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 22915091.0,
+      "reward": 0.171875,
+      "reward_std": 0.2330477386713028,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05,
+      "sampling/sampling_logp_difference/max": 11.02016544342041,
+      "sampling/sampling_logp_difference/mean": 0.019905246794223785,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 2.8753217975463485e-05,
+      "clip_ratio/high_mean": 7.188304493865871e-06,
+      "clip_ratio/low_mean": 3.818478444372886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.537308905128157e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5152.46875,
+      "completions/mean_terminated_length": 5064.03125,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "entropy": 1.0477670058608055,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030069497879594564,
+      "learning_rate": 1e-05,
+      "loss": 0.1026,
+      "num_tokens": 23596487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29142576456069946,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999433755874634,
+      "sampling/importance_sampling_ratio/min": 9.009604013954231e-07,
+      "sampling/sampling_logp_difference/max": 13.919804573059082,
+      "sampling/sampling_logp_difference/mean": 0.019003981724381447,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 3.069575450354023e-05,
+      "clip_ratio/high_mean": 7.673938625885057e-06,
+      "clip_ratio/low_mean": 3.4847614415411954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.252155258654966e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12792.0,
+      "completions/max_terminated_length": 12792.0,
+      "completions/mean_length": 4672.5703125,
+      "completions/mean_terminated_length": 4672.5703125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9471446052193642,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002676331205293536,
+      "learning_rate": 1e-05,
+      "loss": 0.0724,
+      "num_tokens": 24213408.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2988021969795227,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000251531600952,
+      "sampling/importance_sampling_ratio/min": 0.0013351094676181674,
+      "sampling/sampling_logp_difference/max": 6.618741989135742,
+      "sampling/sampling_logp_difference/mean": 0.0179576613008976,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6127243245355203e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6127243245355203e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16108.0,
+      "completions/mean_length": 7013.734375,
+      "completions/mean_terminated_length": 6711.4677734375,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "entropy": 1.1254516392946243,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023615453392267227,
+      "learning_rate": 1e-05,
+      "loss": 0.0384,
+      "num_tokens": 25130262.0,
+      "reward": 0.1953125,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06,
+      "sampling/sampling_logp_difference/max": 11.925450325012207,
+      "sampling/sampling_logp_difference/mean": 0.0215257927775383,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 4.06954040954588e-06,
+      "clip_ratio/high_mean": 1.01738510238647e-06,
+      "clip_ratio/low_mean": 4.180071573500754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.281810015527299e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5858.59375,
+      "completions/mean_terminated_length": 5605.984375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "entropy": 1.0713739022612572,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029018481727689505,
+      "learning_rate": 1e-05,
+      "loss": 0.1041,
+      "num_tokens": 25898194.0,
+      "reward": 0.3671875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999915957450867,
+      "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05,
+      "sampling/sampling_logp_difference/max": 10.992064476013184,
+      "sampling/sampling_logp_difference/mean": 0.019959844648838043,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 1.2810827229259303e-05,
+      "clip_ratio/high_mean": 3.2027068073148257e-06,
+      "clip_ratio/low_mean": 3.29701083501277e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.617281504375569e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14004.0,
+      "completions/mean_length": 6952.6015625,
+      "completions/mean_terminated_length": 6726.24853515625,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "entropy": 1.028619796037674,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022342968732118607,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 26812791.0,
+      "reward": 0.234375,
+      "reward_std": 0.26827272772789,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 4.540153167909011e-05,
+      "sampling/sampling_logp_difference/max": 9.999964714050293,
+      "sampling/sampling_logp_difference/mean": 0.02002539485692978,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 1.5225089100567857e-05,
+      "clip_ratio/high_mean": 6.960676159906143e-06,
+      "clip_ratio/low_mean": 4.09088329433871e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7869508762232726e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16361.0,
+      "completions/mean_length": 6413.421875,
+      "completions/mean_terminated_length": 6174.12841796875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9452399462461472,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021800603717565536,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 27652757.0,
+      "reward": 0.296875,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439120292664,
+      "sampling/importance_sampling_ratio/min": 3.895394547726028e-05,
+      "sampling/sampling_logp_difference/max": 10.153130531311035,
+      "sampling/sampling_logp_difference/mean": 0.019722118973731995,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.9564903318023426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9564903318023426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15754.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 5176.3515625,
+      "completions/mean_terminated_length": 5176.3515625,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 1.0444758981466293,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004153470974415541,
+      "learning_rate": 1e-05,
+      "loss": 0.0798,
+      "num_tokens": 28334386.0,
+      "reward": 0.2734375,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.007421077694743872,
+      "sampling/sampling_logp_difference/max": 4.903430938720703,
+      "sampling/sampling_logp_difference/mean": 0.020159056410193443,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 1.725743459246587e-05,
+      "clip_ratio/high_mean": 4.3143586481164675e-06,
+      "clip_ratio/low_mean": 2.0204584302518924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.451894306432223e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 5178.9921875,
+      "completions/mean_terminated_length": 5001.13525390625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0803537145256996,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002477057045325637,
+      "learning_rate": 1e-05,
+      "loss": 0.0067,
+      "num_tokens": 29017145.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000497102737427,
+      "sampling/importance_sampling_ratio/min": 0.004630985204130411,
+      "sampling/sampling_logp_difference/max": 5.374985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019826076924800873,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 1.6637992303003557e-05,
+      "clip_ratio/high_mean": 4.159498075750889e-06,
+      "clip_ratio/low_mean": 2.1970684144889674e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6130182106953725e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14131.0,
+      "completions/max_terminated_length": 14131.0,
+      "completions/mean_length": 4980.359375,
+      "completions/mean_terminated_length": 4980.359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.9510642662644386,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016275218222290277,
+      "learning_rate": 1e-05,
+      "loss": -0.0097,
+      "num_tokens": 29673535.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750852584839,
+      "sampling/importance_sampling_ratio/min": 0.000599516904912889,
+      "sampling/sampling_logp_difference/max": 7.419386386871338,
+      "sampling/sampling_logp_difference/mean": 0.01844976656138897,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 2.8087193186365766e-05,
+      "clip_ratio/high_mean": 7.021798296591442e-06,
+      "clip_ratio/low_mean": 3.9683913541921356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.670571286169434e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 5778.6953125,
+      "completions/mean_terminated_length": 5695.18896484375,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 1.0413239300251007,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001847646082751453,
+      "learning_rate": 1e-05,
+      "loss": -0.0045,
+      "num_tokens": 30436416.0,
+      "reward": 0.2578125,
+      "reward_std": 0.33903977274894714,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998501539230347,
+      "sampling/importance_sampling_ratio/min": 0.00020348970429040492,
+      "sampling/sampling_logp_difference/max": 8.499895095825195,
+      "sampling/sampling_logp_difference/mean": 0.021502099931240082,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 2.68402091023745e-05,
+      "clip_ratio/high_mean": 8.575278570788214e-06,
+      "clip_ratio/low_mean": 4.547183698377921e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.404711600931478e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14182.0,
+      "completions/max_terminated_length": 14182.0,
+      "completions/mean_length": 4875.125,
+      "completions/mean_terminated_length": 4875.125,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "entropy": 1.0464690178632736,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021134833805263042,
+      "learning_rate": 1e-05,
+      "loss": 0.0727,
+      "num_tokens": 31083672.0,
+      "reward": 0.40625,
+      "reward_std": 0.3584783971309662,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340176582336,
+      "sampling/importance_sampling_ratio/min": 0.012113225646317005,
+      "sampling/sampling_logp_difference/max": 4.41345739364624,
+      "sampling/sampling_logp_difference/mean": 0.019140049815177917,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 3.9877967992651975e-05,
+      "clip_ratio/high_mean": 9.969491998162994e-06,
+      "clip_ratio/low_mean": 3.981287841270387e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9782369273998484e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 4691.421875,
+      "completions/mean_terminated_length": 4505.82568359375,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 1.0229775309562683,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037735572550445795,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 31703654.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2993389964103699,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492168426514,
+      "sampling/importance_sampling_ratio/min": 0.03150063753128052,
+      "sampling/sampling_logp_difference/max": 3.457747459411621,
+      "sampling/sampling_logp_difference/mean": 0.01912039890885353,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 3.5441889849607833e-06,
+      "clip_ratio/high_mean": 8.860472462401958e-07,
+      "clip_ratio/low_mean": 1.5137359810069029e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6023407056309225e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 6821.96875,
+      "completions/mean_terminated_length": 6592.48046875,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 1.1132484003901482,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0010448681423440576,
+      "learning_rate": 1e-05,
+      "loss": 0.022,
+      "num_tokens": 32599778.0,
+      "reward": 0.2265625,
+      "reward_std": 0.1814819872379303,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999915361404419,
+      "sampling/importance_sampling_ratio/min": 0.006500681862235069,
+      "sampling/sampling_logp_difference/max": 5.035848140716553,
+      "sampling/sampling_logp_difference/mean": 0.02125459350645542,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 4.652893949241843e-06,
+      "clip_ratio/high_mean": 1.1632234873104608e-06,
+      "clip_ratio/low_mean": 5.731516603191267e-05,
+      "clip_ratio/low_min": 9.891066838463303e-06,
+      "clip_ratio/region_mean": 5.8478389746596804e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 6834.3671875,
+      "completions/mean_terminated_length": 6605.17626953125,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9827468693256378,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0017670176457613707,
+      "learning_rate": 1e-05,
+      "loss": 0.1105,
+      "num_tokens": 33492737.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3440523147583008,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.0021202093921601772,
+      "sampling/sampling_logp_difference/max": 6.156240463256836,
+      "sampling/sampling_logp_difference/mean": 0.019490526989102364,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 6.717360520269722e-06,
+      "clip_ratio/high_mean": 2.503530367903295e-06,
+      "clip_ratio/low_mean": 2.5672919832686603e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8176450200589898e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14098.0,
+      "completions/mean_length": 6175.296875,
+      "completions/mean_terminated_length": 5845.98388671875,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 1.1584237962961197,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0016891945851966739,
+      "learning_rate": 1e-05,
+      "loss": -0.0008,
+      "num_tokens": 34312455.0,
+      "reward": 0.1875,
+      "reward_std": 0.19673937559127808,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 8.086384332273155e-05,
+      "sampling/sampling_logp_difference/max": 9.422743797302246,
+      "sampling/sampling_logp_difference/mean": 0.021749887615442276,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 2.2362002255249536e-05,
+      "clip_ratio/high_mean": 8.189798336388776e-06,
+      "clip_ratio/low_mean": 2.1058204993096297e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9248002192616696e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16054.0,
+      "completions/mean_length": 6036.8359375,
+      "completions/mean_terminated_length": 5955.3623046875,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.9301538467407227,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003834392176941037,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 35102738.0,
+      "reward": 0.4375,
+      "reward_std": 0.36614155769348145,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998494386672974,
+      "sampling/importance_sampling_ratio/min": 0.00013992394087836146,
+      "sampling/sampling_logp_difference/max": 8.874411582946777,
+      "sampling/sampling_logp_difference/mean": 0.019147861748933792,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1501961580506759e-05,
+      "clip_ratio/high_mean": 2.8754903951266897e-06,
+      "clip_ratio/low_mean": 4.08189714562468e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.369446196506033e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6262.46875,
+      "completions/mean_terminated_length": 5764.68798828125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "entropy": 0.8599015846848488,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029804729856550694,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 35924886.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3911295533180237,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999922513961792,
+      "sampling/importance_sampling_ratio/min": 0.00021375219512265176,
+      "sampling/sampling_logp_difference/max": 9.904524803161621,
+      "sampling/sampling_logp_difference/mean": 0.01815103553235531,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 2.4107544049911667e-05,
+      "clip_ratio/high_mean": 6.026886012477917e-06,
+      "clip_ratio/low_mean": 3.6588148361715866e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.261503391944643e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14556.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 5926.8984375,
+      "completions/mean_terminated_length": 5926.8984375,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "entropy": 1.0042993426322937,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022071697749197483,
+      "learning_rate": 1e-05,
+      "loss": 0.0059,
+      "num_tokens": 36700913.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3306073546409607,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000010371208191,
+      "sampling/importance_sampling_ratio/min": 0.0005220364546403289,
+      "sampling/sampling_logp_difference/max": 7.557773113250732,
+      "sampling/sampling_logp_difference/mean": 0.01954064890742302,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 4.9106265578302555e-06,
+      "clip_ratio/high_mean": 1.2276566394575639e-06,
+      "clip_ratio/low_mean": 2.634599570683349e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7573652346291055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15217.0,
+      "completions/mean_length": 6873.6875,
+      "completions/mean_terminated_length": 6645.4404296875,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 1.0255412608385086,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002320924773812294,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 37604865.0,
+      "reward": 0.234375,
+      "reward_std": 0.3135228157043457,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999098777770996,
+      "sampling/importance_sampling_ratio/min": 0.026153141632676125,
+      "sampling/sampling_logp_difference/max": 3.6437859535217285,
+      "sampling/sampling_logp_difference/mean": 0.019532475620508194,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.6350510122720152e-05,
+      "clip_ratio/high_mean": 4.087627530680038e-06,
+      "clip_ratio/low_mean": 2.351988746340794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7607515221461654e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15668.0,
+      "completions/mean_length": 6073.8984375,
+      "completions/mean_terminated_length": 5992.71630859375,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 1.0713753998279572,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002212709980085492,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 38405196.0,
+      "reward": 0.359375,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998978972434998,
+      "sampling/importance_sampling_ratio/min": 8.706459084351081e-06,
+      "sampling/sampling_logp_difference/max": 11.651445388793945,
+      "sampling/sampling_logp_difference/mean": 0.021252838894724846,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.729486718384578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.729486718384578e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15299.0,
+      "completions/mean_length": 5838.71875,
+      "completions/mean_terminated_length": 5671.33349609375,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "entropy": 1.021155133843422,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001135052996687591,
+      "learning_rate": 1e-05,
+      "loss": 0.0178,
+      "num_tokens": 39171704.0,
+      "reward": 0.28125,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.003084881929680705,
+      "sampling/sampling_logp_difference/max": 5.7812418937683105,
+      "sampling/sampling_logp_difference/mean": 0.020781882107257843,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 1.7124169744420215e-05,
+      "clip_ratio/high_mean": 4.281042436105054e-06,
+      "clip_ratio/low_mean": 3.706903294187214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.135007543482061e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14617.0,
+      "completions/max_terminated_length": 14617.0,
+      "completions/mean_length": 6358.5859375,
+      "completions/mean_terminated_length": 6358.5859375,
+      "completions/min_length": 940.0,
+      "completions/min_terminated_length": 940.0,
+      "entropy": 0.9720487147569656,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002638082252815366,
+      "learning_rate": 1e-05,
+      "loss": 0.0145,
+      "num_tokens": 40003859.0,
+      "reward": 0.40625,
+      "reward_std": 0.3174618184566498,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000380277633667,
+      "sampling/importance_sampling_ratio/min": 0.01960253342986107,
+      "sampling/sampling_logp_difference/max": 3.932096481323242,
+      "sampling/sampling_logp_difference/mean": 0.01991666667163372,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 6.55582925901399e-06,
+      "clip_ratio/high_mean": 2.994117721755174e-06,
+      "clip_ratio/low_mean": 2.222621503733535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5220332759090525e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14753.0,
+      "completions/max_terminated_length": 14753.0,
+      "completions/mean_length": 4634.1875,
+      "completions/mean_terminated_length": 4634.1875,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9715309366583824,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001994960242882371,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 40616483.0,
+      "reward": 0.4375,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000698566436768,
+      "sampling/importance_sampling_ratio/min": 1.0510009815334342e-05,
+      "sampling/sampling_logp_difference/max": 11.46318244934082,
+      "sampling/sampling_logp_difference/mean": 0.01902047172188759,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 2.2474248908110894e-05,
+      "clip_ratio/high_mean": 7.571314540655294e-06,
+      "clip_ratio/low_mean": 4.3583780325207044e-05,
+      "clip_ratio/low_min": 4.6013396968191955e-06,
+      "clip_ratio/region_mean": 5.1155094070054474e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 6596.25,
+      "completions/mean_terminated_length": 6361.34423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 0.8207943215966225,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019902780186384916,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 41484443.0,
+      "reward": 0.4453125,
+      "reward_std": 0.326668381690979,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000016689300537,
+      "sampling/importance_sampling_ratio/min": 7.485233072657138e-05,
+      "sampling/sampling_logp_difference/max": 9.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.018301833420991898,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 3.0019932637515012e-06,
+      "clip_ratio/high_mean": 7.504983159378753e-07,
+      "clip_ratio/low_mean": 4.332785601945943e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.407835376696312e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6785.75,
+      "completions/mean_terminated_length": 6313.70458984375,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "entropy": 0.9876058474183083,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015235114842653275,
+      "learning_rate": 1e-05,
+      "loss": 0.0128,
+      "num_tokens": 42372235.0,
+      "reward": 0.2421875,
+      "reward_std": 0.325075626373291,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999551773071289,
+      "sampling/importance_sampling_ratio/min": 0.026679370552301407,
+      "sampling/sampling_logp_difference/max": 3.6238646507263184,
+      "sampling/sampling_logp_difference/mean": 0.019945615902543068,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1349006601667497e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1349006601667497e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 4881.2109375,
+      "completions/mean_terminated_length": 4510.1533203125,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.989942155778408,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002033712575212121,
+      "learning_rate": 1e-05,
+      "loss": 0.1088,
+      "num_tokens": 43015238.0,
+      "reward": 0.4375,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000300407409668,
+      "sampling/importance_sampling_ratio/min": 0.0001238943514181301,
+      "sampling/sampling_logp_difference/max": 8.996081352233887,
+      "sampling/sampling_logp_difference/mean": 0.01887543685734272,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 2.584004687378183e-05,
+      "clip_ratio/high_mean": 6.4600117184454575e-06,
+      "clip_ratio/low_mean": 2.1371045761497953e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7831058105221018e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15001.0,
+      "completions/max_terminated_length": 15001.0,
+      "completions/mean_length": 4725.3984375,
+      "completions/mean_terminated_length": 4725.3984375,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 1.0350637435913086,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030296226032078266,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 43637737.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999939203262329,
+      "sampling/importance_sampling_ratio/min": 0.00022932067804504186,
+      "sampling/sampling_logp_difference/max": 8.380389213562012,
+      "sampling/sampling_logp_difference/mean": 0.01995944231748581,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 1.994733975152485e-05,
+      "clip_ratio/high_mean": 4.986834937881213e-06,
+      "clip_ratio/low_mean": 3.5168303838872816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.015513832200668e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16240.0,
+      "completions/mean_length": 4918.171875,
+      "completions/mean_terminated_length": 4736.1748046875,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.965274304151535,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002758471528068185,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 44285327.0,
+      "reward": 0.328125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999663233757019,
+      "sampling/importance_sampling_ratio/min": 0.010958661325275898,
+      "sampling/sampling_logp_difference/max": 4.513625144958496,
+      "sampling/sampling_logp_difference/mean": 0.019083233550190926,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 1.0621563887980301e-05,
+      "clip_ratio/high_mean": 2.6553909719950752e-06,
+      "clip_ratio/low_mean": 3.838553107016196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1040922042157035e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15031.0,
+      "completions/mean_length": 4998.2890625,
+      "completions/mean_terminated_length": 4908.6376953125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9200445115566254,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027611786499619484,
+      "learning_rate": 1e-05,
+      "loss": 0.0575,
+      "num_tokens": 44944356.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3895368278026581,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884366989136,
+      "sampling/importance_sampling_ratio/min": 0.0018651526188477874,
+      "sampling/sampling_logp_difference/max": 6.284412384033203,
+      "sampling/sampling_logp_difference/mean": 0.017853498458862305,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.0136624496226432e-05,
+      "clip_ratio/high_mean": 2.534156124056608e-06,
+      "clip_ratio/low_mean": 2.0260404085092887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2794560095462657e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6290.1796875,
+      "completions/mean_terminated_length": 6129.96044921875,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.9360214695334435,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015557854203507304,
+      "learning_rate": 1e-05,
+      "loss": 0.0111,
+      "num_tokens": 45767867.0,
+      "reward": 0.34375,
+      "reward_std": 0.30168038606643677,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999427795410156,
+      "sampling/importance_sampling_ratio/min": 0.0011004531988874078,
+      "sampling/sampling_logp_difference/max": 6.812033176422119,
+      "sampling/sampling_logp_difference/mean": 0.0200855303555727,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 2.2559511307918e-06,
+      "clip_ratio/high_mean": 5.6398778269795e-07,
+      "clip_ratio/low_mean": 4.51761221711422e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.574010984015331e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16366.0,
+      "completions/mean_length": 6486.15625,
+      "completions/mean_terminated_length": 6248.6083984375,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "entropy": 0.863138921558857,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026953541673719883,
+      "learning_rate": 1e-05,
+      "loss": -0.0194,
+      "num_tokens": 46618575.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999406337738037,
+      "sampling/importance_sampling_ratio/min": 0.0011708897072821856,
+      "sampling/sampling_logp_difference/max": 6.749991416931152,
+      "sampling/sampling_logp_difference/mean": 0.01863238587975502,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.0073357771034352e-05,
+      "clip_ratio/high_mean": 2.518339442758588e-06,
+      "clip_ratio/low_mean": 2.787370635815023e-05,
+      "clip_ratio/low_min": 3.837534222839167e-06,
+      "clip_ratio/region_mean": 3.0392045573535142e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 6442.7734375,
+      "completions/mean_terminated_length": 6284.9765625,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0242054909467697,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024442619178444147,
+      "learning_rate": 1e-05,
+      "loss": 0.0569,
+      "num_tokens": 47462274.0,
+      "reward": 0.328125,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998892545700073,
+      "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09,
+      "sampling/sampling_logp_difference/max": 19.124980926513672,
+      "sampling/sampling_logp_difference/mean": 0.019810764119029045,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 1.220810372615233e-05,
+      "clip_ratio/high_mean": 3.0520259315380827e-06,
+      "clip_ratio/low_mean": 4.339240456374682e-05,
+      "clip_ratio/low_min": 4.491233084991109e-06,
+      "clip_ratio/region_mean": 4.644443038159807e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 4807.765625,
+      "completions/mean_terminated_length": 4716.6142578125,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "entropy": 1.045751042664051,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002512057079002261,
+      "learning_rate": 1e-05,
+      "loss": 0.003,
+      "num_tokens": 48096692.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999058842658997,
+      "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05,
+      "sampling/sampling_logp_difference/max": 11.374892234802246,
+      "sampling/sampling_logp_difference/mean": 0.01960371434688568,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 5.37941218681226e-06,
+      "clip_ratio/high_mean": 1.344853046703065e-06,
+      "clip_ratio/low_mean": 3.0161771633174794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1506624850408116e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 6703.8359375,
+      "completions/mean_terminated_length": 6471.51220703125,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 1.0592866837978363,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016389708034694195,
+      "learning_rate": 1e-05,
+      "loss": -0.024,
+      "num_tokens": 48974399.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2585548758506775,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999353885650635,
+      "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06,
+      "sampling/sampling_logp_difference/max": 11.8125,
+      "sampling/sampling_logp_difference/mean": 0.020880095660686493,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 7.093600515872822e-06,
+      "clip_ratio/high_mean": 1.7734001289682055e-06,
+      "clip_ratio/low_mean": 4.470584758564655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.647924811251869e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16295.0,
+      "completions/mean_length": 6140.5078125,
+      "completions/mean_terminated_length": 5724.10546875,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 1.0998501181602478,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003946912474930286,
+      "learning_rate": 1e-05,
+      "loss": 0.0448,
+      "num_tokens": 49779920.0,
+      "reward": 0.34375,
+      "reward_std": 0.36796674132347107,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 2.849436668839189e-07,
+      "sampling/sampling_logp_difference/max": 15.070974349975586,
+      "sampling/sampling_logp_difference/mean": 0.021355850622057915,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.313956779038563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.313956779038563e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 6689.8046875,
+      "completions/mean_terminated_length": 6213.04052734375,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.8561654165387154,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021656695753335953,
+      "learning_rate": 1e-05,
+      "loss": 0.0283,
+      "num_tokens": 50655023.0,
+      "reward": 0.203125,
+      "reward_std": 0.21723884344100952,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999941885471344,
+      "sampling/importance_sampling_ratio/min": 2.836359499269747e-06,
+      "sampling/sampling_logp_difference/max": 12.772989273071289,
+      "sampling/sampling_logp_difference/mean": 0.01873670145869255,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 2.3421607693308033e-05,
+      "clip_ratio/high_mean": 7.242933975248889e-06,
+      "clip_ratio/low_mean": 3.896083626386826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.620377103492501e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14330.0,
+      "completions/max_terminated_length": 14330.0,
+      "completions/mean_length": 5707.0078125,
+      "completions/mean_terminated_length": 5707.0078125,
+      "completions/min_length": 625.0,
+      "completions/min_terminated_length": 625.0,
+      "entropy": 1.1396166533231735,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004121148493140936,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 51406536.0,
+      "reward": 0.3125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999328851699829,
+      "sampling/importance_sampling_ratio/min": 0.0005196487763896585,
+      "sampling/sampling_logp_difference/max": 7.562357425689697,
+      "sampling/sampling_logp_difference/mean": 0.020000409334897995,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 1.82290532393381e-05,
+      "clip_ratio/high_mean": 4.557263309834525e-06,
+      "clip_ratio/low_mean": 2.5275351731579576e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9832615496161452e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5655.6328125,
+      "completions/mean_terminated_length": 5571.1572265625,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "entropy": 0.8928132206201553,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032538517843931913,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 52148473.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29432642459869385,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000033378601074,
+      "sampling/importance_sampling_ratio/min": 0.0017573959194123745,
+      "sampling/sampling_logp_difference/max": 6.343922138214111,
+      "sampling/sampling_logp_difference/mean": 0.018881790339946747,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.2836022506235167e-05,
+      "clip_ratio/high_mean": 3.209005626558792e-06,
+      "clip_ratio/low_mean": 3.8109637216621195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.131864307055366e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7399.7890625,
+      "completions/mean_terminated_length": 7034.5771484375,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "entropy": 0.8808257132768631,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002061733277514577,
+      "learning_rate": 1e-05,
+      "loss": 0.0191,
+      "num_tokens": 53113230.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673962593079,
+      "sampling/importance_sampling_ratio/min": 0.005283349193632603,
+      "sampling/sampling_logp_difference/max": 5.243195056915283,
+      "sampling/sampling_logp_difference/mean": 0.018456293269991875,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 1.5806871488166507e-05,
+      "clip_ratio/high_mean": 4.739466817227367e-06,
+      "clip_ratio/low_mean": 3.610486896832299e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.084433521711617e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16208.0,
+      "completions/mean_length": 5730.9609375,
+      "completions/mean_terminated_length": 5475.2880859375,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9486126750707626,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012298432411625981,
+      "learning_rate": 1e-05,
+      "loss": 0.0208,
+      "num_tokens": 53864049.0,
+      "reward": 0.359375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999348521232605,
+      "sampling/importance_sampling_ratio/min": 4.832820559386164e-05,
+      "sampling/sampling_logp_difference/max": 9.937495231628418,
+      "sampling/sampling_logp_difference/mean": 0.01919996738433838,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.2390134997986024e-05,
+      "clip_ratio/high_mean": 3.097533749496506e-06,
+      "clip_ratio/low_mean": 3.8867822581778455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.19653564449618e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13500.0,
+      "completions/mean_length": 4620.5703125,
+      "completions/mean_terminated_length": 4527.94482421875,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9557560831308365,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002882040338590741,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 54473498.0,
+      "reward": 0.3984375,
+      "reward_std": 0.39294686913490295,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998915195465088,
+      "sampling/importance_sampling_ratio/min": 1.577107298089686e-07,
+      "sampling/sampling_logp_difference/max": 15.662503242492676,
+      "sampling/sampling_logp_difference/mean": 0.018525000661611557,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.088819471486204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.088819471486204e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16314.0,
+      "completions/max_terminated_length": 16314.0,
+      "completions/mean_length": 5074.0703125,
+      "completions/mean_terminated_length": 5074.0703125,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8830869868397713,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003324020653963089,
+      "learning_rate": 1e-05,
+      "loss": 0.0305,
+      "num_tokens": 55141787.0,
+      "reward": 0.4609375,
+      "reward_std": 0.30115634202957153,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999203681945801,
+      "sampling/importance_sampling_ratio/min": 0.0009876838885247707,
+      "sampling/sampling_logp_difference/max": 6.920147895812988,
+      "sampling/sampling_logp_difference/mean": 0.018072880804538727,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.526649884908693e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.526649884908693e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15251.0,
+      "completions/max_terminated_length": 15251.0,
+      "completions/mean_length": 6192.1015625,
+      "completions/mean_terminated_length": 6192.1015625,
+      "completions/min_length": 553.0,
+      "completions/min_terminated_length": 553.0,
+      "entropy": 1.0888547226786613,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017452294705435634,
+      "learning_rate": 1e-05,
+      "loss": 0.0216,
+      "num_tokens": 55954144.0,
+      "reward": 0.2890625,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473690986633,
+      "sampling/importance_sampling_ratio/min": 5.061922365712235e-07,
+      "sampling/sampling_logp_difference/max": 14.496349334716797,
+      "sampling/sampling_logp_difference/mean": 0.021221645176410675,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.6768677141953958e-05,
+      "clip_ratio/high_mean": 5.080836899651331e-06,
+      "clip_ratio/low_mean": 3.340929970363504e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.84901372854074e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6204.296875,
+      "completions/mean_terminated_length": 6124.1416015625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 1.0423575639724731,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0033357341308146715,
+      "learning_rate": 1e-05,
+      "loss": 0.1073,
+      "num_tokens": 56765470.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37875816226005554,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99998539686203,
+      "sampling/importance_sampling_ratio/min": 4.564182381727733e-05,
+      "sampling/sampling_logp_difference/max": 9.994686126708984,
+      "sampling/sampling_logp_difference/mean": 0.01908688060939312,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 3.149884150843718e-06,
+      "clip_ratio/high_mean": 7.874710377109295e-07,
+      "clip_ratio/low_mean": 2.430614893000893e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.509361991087644e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14409.0,
+      "completions/max_terminated_length": 14409.0,
+      "completions/mean_length": 5070.3125,
+      "completions/mean_terminated_length": 5070.3125,
+      "completions/min_length": 629.0,
+      "completions/min_terminated_length": 629.0,
+      "entropy": 1.0737399458885193,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038695367984473705,
+      "learning_rate": 1e-05,
+      "loss": 0.0015,
+      "num_tokens": 57432958.0,
+      "reward": 0.390625,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223947525024,
+      "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06,
+      "sampling/sampling_logp_difference/max": 13.376652717590332,
+      "sampling/sampling_logp_difference/mean": 0.01970684342086315,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 1.9821940441033803e-05,
+      "clip_ratio/high_mean": 4.955485110258451e-06,
+      "clip_ratio/low_mean": 2.9055729555693688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.401121466595214e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15799.0,
+      "completions/mean_length": 5750.21875,
+      "completions/mean_terminated_length": 5495.00830078125,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "entropy": 0.9708107560873032,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002927646040916443,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 58187426.0,
+      "reward": 0.296875,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999390840530396,
+      "sampling/importance_sampling_ratio/min": 0.015204614959657192,
+      "sampling/sampling_logp_difference/max": 4.186156272888184,
+      "sampling/sampling_logp_difference/mean": 0.019483914598822594,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.3815636723156786e-05,
+      "clip_ratio/high_mean": 5.953909180789196e-06,
+      "clip_ratio/low_mean": 4.989707144886779e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.585097960647545e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15938.0,
+      "completions/mean_length": 6067.484375,
+      "completions/mean_terminated_length": 5986.251953125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9576351121068001,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0026169484481215477,
+      "learning_rate": 1e-05,
+      "loss": -0.0055,
+      "num_tokens": 58983336.0,
+      "reward": 0.390625,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999620914459229,
+      "sampling/importance_sampling_ratio/min": 1.974713995878119e-06,
+      "sampling/sampling_logp_difference/max": 13.135087013244629,
+      "sampling/sampling_logp_difference/mean": 0.019007554277777672,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 2.4238934656750644e-05,
+      "clip_ratio/high_mean": 7.786730066072778e-06,
+      "clip_ratio/low_mean": 4.5700241571466904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3486972547034384e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13640.0,
+      "completions/max_terminated_length": 13640.0,
+      "completions/mean_length": 4612.8984375,
+      "completions/mean_terminated_length": 4612.8984375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "entropy": 0.9636320173740387,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015429699560627341,
+      "learning_rate": 1e-05,
+      "loss": -0.018,
+      "num_tokens": 59590763.0,
+      "reward": 0.421875,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08,
+      "sampling/sampling_logp_difference/max": 17.468652725219727,
+      "sampling/sampling_logp_difference/mean": 0.019313856959342957,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0911465842109465e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0911465842109465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 6101.3125,
+      "completions/mean_terminated_length": 5854.5283203125,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "entropy": 0.8831139355897903,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022505265660583973,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 60391283.0,
+      "reward": 0.3125,
+      "reward_std": 0.29302334785461426,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 0.0003816343960352242,
+      "sampling/sampling_logp_difference/max": 7.871047496795654,
+      "sampling/sampling_logp_difference/mean": 0.018377842381596565,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 1.547606643725885e-05,
+      "clip_ratio/high_mean": 3.869016609314713e-06,
+      "clip_ratio/low_mean": 2.478705800967873e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8656074391619768e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14862.0,
+      "completions/mean_length": 4705.9921875,
+      "completions/mean_terminated_length": 4614.03955078125,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "entropy": 0.9557913094758987,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002069958718493581,
+      "learning_rate": 1e-05,
+      "loss": -0.0015,
+      "num_tokens": 61021490.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2637920379638672,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030232429504,
+      "sampling/importance_sampling_ratio/min": 2.76673017651774e-05,
+      "sampling/sampling_logp_difference/max": 10.495259284973145,
+      "sampling/sampling_logp_difference/mean": 0.018629569560289383,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 2.0910484636260662e-05,
+      "clip_ratio/high_mean": 5.2276211590651656e-06,
+      "clip_ratio/low_mean": 1.952954164607945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4757162805144617e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13745.0,
+      "completions/max_terminated_length": 13745.0,
+      "completions/mean_length": 5116.78125,
+      "completions/mean_terminated_length": 5116.78125,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "entropy": 1.0198405236005783,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034461067989468575,
+      "learning_rate": 1e-05,
+      "loss": -0.0073,
+      "num_tokens": 61695382.0,
+      "reward": 0.265625,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999936819076538,
+      "sampling/importance_sampling_ratio/min": 0.012227212078869343,
+      "sampling/sampling_logp_difference/max": 4.4040913581848145,
+      "sampling/sampling_logp_difference/mean": 0.019400250166654587,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.5340228401328204e-05,
+      "clip_ratio/high_mean": 3.835057100332051e-06,
+      "clip_ratio/low_mean": 3.150914017169271e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.534419727202476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 5891.9140625,
+      "completions/mean_terminated_length": 5553.45947265625,
+      "completions/min_length": 79.0,
+      "completions/min_terminated_length": 79.0,
+      "entropy": 0.9568078517913818,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025854657869786024,
+      "learning_rate": 1e-05,
+      "loss": 0.1013,
+      "num_tokens": 62474883.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001013278961182,
+      "sampling/importance_sampling_ratio/min": 0.0015072470996528864,
+      "sampling/sampling_logp_difference/max": 6.497470378875732,
+      "sampling/sampling_logp_difference/mean": 0.019574139267206192,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 1.108303422370227e-05,
+      "clip_ratio/high_mean": 2.7707585559255676e-06,
+      "clip_ratio/low_mean": 2.2325777763398946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5096536319324514e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13671.0,
+      "completions/mean_length": 5300.3359375,
+      "completions/mean_terminated_length": 5213.06298828125,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "entropy": 0.9722280204296112,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025075653102248907,
+      "learning_rate": 1e-05,
+      "loss": 0.0312,
+      "num_tokens": 63172454.0,
+      "reward": 0.203125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 0.00020346972451079637,
+      "sampling/sampling_logp_difference/max": 8.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02002432942390442,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 1.3991947980684927e-05,
+      "clip_ratio/high_mean": 3.4979869951712317e-06,
+      "clip_ratio/low_mean": 4.893367201930232e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.243165958290774e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15617.0,
+      "completions/mean_length": 6364.21875,
+      "completions/mean_terminated_length": 6205.1748046875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "entropy": 1.0607495978474617,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017982006538659334,
+      "learning_rate": 1e-05,
+      "loss": -0.0117,
+      "num_tokens": 64007602.0,
+      "reward": 0.2890625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 3.823801307589747e-05,
+      "sampling/sampling_logp_difference/max": 10.171680450439453,
+      "sampling/sampling_logp_difference/mean": 0.020373597741127014,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6416430046083406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6416430046083406e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14709.0,
+      "completions/mean_length": 5746.3125,
+      "completions/mean_terminated_length": 5403.1611328125,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "entropy": 0.9913106113672256,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002207317156717181,
+      "learning_rate": 1e-05,
+      "loss": 0.063,
+      "num_tokens": 64762058.0,
+      "reward": 0.34375,
+      "reward_std": 0.3264310359954834,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999239444732666,
+      "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08,
+      "sampling/sampling_logp_difference/max": 16.744617462158203,
+      "sampling/sampling_logp_difference/mean": 0.020608089864253998,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 1.2681661701208213e-05,
+      "clip_ratio/high_mean": 3.1704154253020533e-06,
+      "clip_ratio/low_mean": 3.541917828897567e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.85895939416514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6088.5625,
+      "completions/mean_terminated_length": 5841.47216796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.9040444120764732,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012974507408216596,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 65561002.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998487234115601,
+      "sampling/importance_sampling_ratio/min": 6.021501121722395e-06,
+      "sampling/sampling_logp_difference/max": 12.020174026489258,
+      "sampling/sampling_logp_difference/mean": 0.01939838007092476,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 7.807132533343975e-06,
+      "clip_ratio/high_mean": 1.9517831333359936e-06,
+      "clip_ratio/low_mean": 1.8564539345788944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.05163223654381e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15021.0,
+      "completions/mean_length": 5765.5,
+      "completions/mean_terminated_length": 5510.65625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 0.9966336265206337,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0013380619930103421,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 66318482.0,
+      "reward": 0.375,
+      "reward_std": 0.13994136452674866,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999471306800842,
+      "sampling/importance_sampling_ratio/min": 7.288413598871557e-06,
+      "sampling/sampling_logp_difference/max": 11.829224586486816,
+      "sampling/sampling_logp_difference/mean": 0.018109245225787163,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 1.7906912489706883e-05,
+      "clip_ratio/high_mean": 4.476728122426721e-06,
+      "clip_ratio/low_mean": 2.5812531305291486e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0289259655091882e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16120.0,
+      "completions/mean_length": 5462.78125,
+      "completions/mean_terminated_length": 5200.67236328125,
+      "completions/min_length": 460.0,
+      "completions/min_terminated_length": 460.0,
+      "entropy": 0.9345141425728798,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023930128663778305,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 67038582.0,
+      "reward": 0.46875,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513030052185,
+      "sampling/importance_sampling_ratio/min": 0.008508839644491673,
+      "sampling/sampling_logp_difference/max": 4.7666497230529785,
+      "sampling/sampling_logp_difference/mean": 0.019220296293497086,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.551389118503721e-05,
+      "clip_ratio/high_mean": 3.878472796259302e-06,
+      "clip_ratio/low_mean": 3.239646628117043e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6274939645863924e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15034.0,
+      "completions/max_terminated_length": 15034.0,
+      "completions/mean_length": 5547.5078125,
+      "completions/mean_terminated_length": 5547.5078125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0511749312281609,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0013633714988827705,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 67774487.0,
+      "reward": 0.203125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05,
+      "sampling/sampling_logp_difference/max": 11.418023109436035,
+      "sampling/sampling_logp_difference/mean": 0.020328814163804054,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.5384989410449634e-05,
+      "clip_ratio/high_mean": 3.846247352612409e-06,
+      "clip_ratio/low_mean": 3.441604167164769e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.826228908110352e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5835.4140625,
+      "completions/mean_terminated_length": 5406.609375,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 1.0024723336100578,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0036165034398436546,
+      "learning_rate": 1e-05,
+      "loss": 0.0373,
+      "num_tokens": 68541660.0,
+      "reward": 0.34375,
+      "reward_std": 0.3584783673286438,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 9.518130354990717e-06,
+      "sampling/sampling_logp_difference/max": 11.562312126159668,
+      "sampling/sampling_logp_difference/mean": 0.020469525828957558,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 6.105602551542688e-06,
+      "clip_ratio/high_mean": 1.526400637885672e-06,
+      "clip_ratio/low_mean": 5.3129634352444555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.46560352177039e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15695.0,
+      "completions/mean_length": 6252.609375,
+      "completions/mean_terminated_length": 6172.83447265625,
+      "completions/min_length": 481.0,
+      "completions/min_terminated_length": 481.0,
+      "entropy": 1.0325519517064095,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022011541295796633,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 69365418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.32301604747772217,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998809099197388,
+      "sampling/importance_sampling_ratio/min": 0.0005531083443202078,
+      "sampling/sampling_logp_difference/max": 7.4999566078186035,
+      "sampling/sampling_logp_difference/mean": 0.02079072594642639,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 4.348128641140647e-06,
+      "clip_ratio/high_mean": 1.0870321602851618e-06,
+      "clip_ratio/low_mean": 3.0097819148977578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.118485085451539e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15316.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 5581.484375,
+      "completions/mean_terminated_length": 5581.484375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.9222500994801521,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002300912281498313,
+      "learning_rate": 1e-05,
+      "loss": -0.0007,
+      "num_tokens": 70099320.0,
+      "reward": 0.296875,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998577833175659,
+      "sampling/importance_sampling_ratio/min": 8.140386853483506e-08,
+      "sampling/sampling_logp_difference/max": 16.323843002319336,
+      "sampling/sampling_logp_difference/mean": 0.01952272653579712,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.5122252029395895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5122252029395895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15781.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5424.140625,
+      "completions/mean_terminated_length": 5424.140625,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "entropy": 1.0446564108133316,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016312639927491546,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 70811474.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000094175338745,
+      "sampling/importance_sampling_ratio/min": 0.0021919538266956806,
+      "sampling/sampling_logp_difference/max": 6.12296199798584,
+      "sampling/sampling_logp_difference/mean": 0.019741754978895187,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.0354576261306647e-05,
+      "clip_ratio/high_mean": 3.496124691082514e-06,
+      "clip_ratio/low_mean": 4.096481598026003e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.446094089871622e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 5884.9609375,
+      "completions/mean_terminated_length": 5884.9609375,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9605691060423851,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032865386456251144,
+      "learning_rate": 1e-05,
+      "loss": 0.0451,
+      "num_tokens": 71582701.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3514111638069153,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999833106994629,
+      "sampling/importance_sampling_ratio/min": 1.149311810877407e-05,
+      "sampling/sampling_logp_difference/max": 11.373762130737305,
+      "sampling/sampling_logp_difference/mean": 0.019438734278082848,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 1.026998006636859e-05,
+      "clip_ratio/high_mean": 2.5674950165921473e-06,
+      "clip_ratio/low_mean": 3.5440503552308655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8007998455213965e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15361.0,
+      "completions/max_terminated_length": 15361.0,
+      "completions/mean_length": 4835.09375,
+      "completions/mean_terminated_length": 4835.09375,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.9038172215223312,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004721678793430328,
+      "learning_rate": 1e-05,
+      "loss": 0.1143,
+      "num_tokens": 72220025.0,
+      "reward": 0.4765625,
+      "reward_std": 0.38481879234313965,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99994957447052,
+      "sampling/importance_sampling_ratio/min": 2.710051205667696e-07,
+      "sampling/sampling_logp_difference/max": 15.12112808227539,
+      "sampling/sampling_logp_difference/mean": 0.017888439819216728,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 2.93432283342554e-05,
+      "clip_ratio/high_mean": 9.56252398509605e-06,
+      "clip_ratio/low_mean": 4.7865792453194445e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.742831808674964e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14431.0,
+      "completions/mean_length": 5979.078125,
+      "completions/mean_terminated_length": 5897.1494140625,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "entropy": 1.0227951630949974,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0010532280430197716,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 73005515.0,
+      "reward": 0.2890625,
+      "reward_std": 0.30115631222724915,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999090433120728,
+      "sampling/importance_sampling_ratio/min": 0.00030157779110595584,
+      "sampling/sampling_logp_difference/max": 8.10648250579834,
+      "sampling/sampling_logp_difference/mean": 0.019633149728178978,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 4.203234766464448e-06,
+      "clip_ratio/high_mean": 1.050808691616112e-06,
+      "clip_ratio/low_mean": 2.5574990331733716e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6625799137036665e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15886.0,
+      "completions/max_terminated_length": 15886.0,
+      "completions/mean_length": 4292.1796875,
+      "completions/mean_terminated_length": 4292.1796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.8719984591007233,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038324075285345316,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 73572794.0,
+      "reward": 0.4375,
+      "reward_std": 0.2972046136856079,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999188780784607,
+      "sampling/importance_sampling_ratio/min": 0.015675775706768036,
+      "sampling/sampling_logp_difference/max": 4.155638694763184,
+      "sampling/sampling_logp_difference/mean": 0.018074234947562218,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 4.431366960488958e-06,
+      "clip_ratio/high_mean": 1.1078417401222396e-06,
+      "clip_ratio/low_mean": 4.433405501913512e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.54418968729442e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14674.0,
+      "completions/max_terminated_length": 14674.0,
+      "completions/mean_length": 5449.2890625,
+      "completions/mean_terminated_length": 5449.2890625,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9137986451387405,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004843447357416153,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 74289607.0,
+      "reward": 0.5,
+      "reward_std": 0.40609243512153625,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 8.851584993863071e-07,
+      "sampling/sampling_logp_difference/max": 13.937499046325684,
+      "sampling/sampling_logp_difference/mean": 0.018183842301368713,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 8.212076863856055e-06,
+      "clip_ratio/high_mean": 2.0530192159640137e-06,
+      "clip_ratio/low_mean": 3.6279372466196946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.833239122741361e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16163.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 4983.3515625,
+      "completions/mean_terminated_length": 4983.3515625,
+      "completions/min_length": 541.0,
+      "completions/min_terminated_length": 541.0,
+      "entropy": 0.9354705810546875,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037651765160262585,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 74946484.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519309043884,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549984931946,
+      "sampling/importance_sampling_ratio/min": 0.00011593531962716952,
+      "sampling/sampling_logp_difference/max": 9.062478065490723,
+      "sampling/sampling_logp_difference/mean": 0.018207306042313576,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.3182888324081432e-05,
+      "clip_ratio/high_mean": 3.295722081020358e-06,
+      "clip_ratio/low_mean": 2.544108633628639e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8736808644680423e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16039.0,
+      "completions/mean_length": 6351.1015625,
+      "completions/mean_terminated_length": 6027.45947265625,
+      "completions/min_length": 804.0,
+      "completions/min_terminated_length": 804.0,
+      "entropy": 0.9310042560100555,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0009160125628113747,
+      "learning_rate": 1e-05,
+      "loss": -0.023,
+      "num_tokens": 75779145.0,
+      "reward": 0.3828125,
+      "reward_std": 0.24329257011413574,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998877048492432,
+      "sampling/importance_sampling_ratio/min": 0.0002961359277833253,
+      "sampling/sampling_logp_difference/max": 8.1246919631958,
+      "sampling/sampling_logp_difference/mean": 0.018513178452849388,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.1402620202716207e-05,
+      "clip_ratio/high_mean": 3.935649147024378e-06,
+      "clip_ratio/low_mean": 3.059757568735222e-05,
+      "clip_ratio/low_min": 4.3258582991256844e-06,
+      "clip_ratio/region_mean": 3.45332257438713e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14471.0,
+      "completions/mean_length": 5293.40625,
+      "completions/mean_terminated_length": 4935.64501953125,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "entropy": 1.0732879787683487,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023993055801838636,
+      "learning_rate": 1e-05,
+      "loss": 0.1021,
+      "num_tokens": 76475557.0,
+      "reward": 0.34375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000077724456787,
+      "sampling/importance_sampling_ratio/min": 6.613240111619234e-05,
+      "sampling/sampling_logp_difference/max": 9.623851776123047,
+      "sampling/sampling_logp_difference/mean": 0.020792219787836075,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 2.130644793396641e-05,
+      "clip_ratio/high_mean": 8.929533635182452e-06,
+      "clip_ratio/low_mean": 2.663600798769039e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.556554071337814e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 7619.7578125,
+      "completions/mean_terminated_length": 7409.41650390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9646238535642624,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014872358879074454,
+      "learning_rate": 1e-05,
+      "loss": 0.0439,
+      "num_tokens": 77474310.0,
+      "reward": 0.34375,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999638795852661,
+      "sampling/importance_sampling_ratio/min": 0.0016686831368133426,
+      "sampling/sampling_logp_difference/max": 6.395720481872559,
+      "sampling/sampling_logp_difference/mean": 0.020074717700481415,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 1.7765815300663235e-05,
+      "clip_ratio/high_mean": 5.154013138053415e-06,
+      "clip_ratio/low_mean": 5.166909659237717e-05,
+      "clip_ratio/low_min": 8.365680514543783e-06,
+      "clip_ratio/region_mean": 5.68231100714911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15984.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 5959.921875,
+      "completions/mean_terminated_length": 5959.921875,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.004471093416214,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00398358516395092,
+      "learning_rate": 1e-05,
+      "loss": 0.1016,
+      "num_tokens": 78257132.0,
+      "reward": 0.359375,
+      "reward_std": 0.3653082847595215,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000170469284058,
+      "sampling/importance_sampling_ratio/min": 0.0030075267422944307,
+      "sampling/sampling_logp_difference/max": 5.806637287139893,
+      "sampling/sampling_logp_difference/mean": 0.020755283534526825,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6946955838648137e-05,
+      "clip_ratio/high_mean": 4.236738959662034e-06,
+      "clip_ratio/low_mean": 4.510891039899434e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.934564867653535e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13736.0,
+      "completions/mean_length": 5427.03125,
+      "completions/mean_terminated_length": 5340.755859375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9117375314235687,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0019883522763848305,
+      "learning_rate": 1e-05,
+      "loss": 0.01,
+      "num_tokens": 78971072.0,
+      "reward": 0.375,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000550746917725,
+      "sampling/importance_sampling_ratio/min": 0.0008046010043472052,
+      "sampling/sampling_logp_difference/max": 7.125164031982422,
+      "sampling/sampling_logp_difference/mean": 0.018812140449881554,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 2.968176841022796e-05,
+      "clip_ratio/high_mean": 7.42044210255699e-06,
+      "clip_ratio/low_mean": 3.220799408154562e-05,
+      "clip_ratio/low_min": 5.315981979947537e-06,
+      "clip_ratio/region_mean": 3.962843629778945e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16293.0,
+      "completions/max_terminated_length": 16293.0,
+      "completions/mean_length": 6062.078125,
+      "completions/mean_terminated_length": 6062.078125,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 1.0164100378751755,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00450351694598794,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 79764434.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26355957984924316,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999713897705078,
+      "sampling/importance_sampling_ratio/min": 0.0007411236292682588,
+      "sampling/sampling_logp_difference/max": 7.207343101501465,
+      "sampling/sampling_logp_difference/mean": 0.020526543259620667,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.856050622947805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.856050622947805e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13689.0,
+      "completions/max_terminated_length": 13689.0,
+      "completions/mean_length": 4856.53125,
+      "completions/mean_terminated_length": 4856.53125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "entropy": 1.0780886858701706,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0033157530706375837,
+      "learning_rate": 1e-05,
+      "loss": 0.046,
+      "num_tokens": 80405238.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3487703502178192,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999889135360718,
+      "sampling/importance_sampling_ratio/min": 0.033773623406887054,
+      "sampling/sampling_logp_difference/max": 3.7256407737731934,
+      "sampling/sampling_logp_difference/mean": 0.019188418984413147,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.975351790406421e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.975351790406421e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16335.0,
+      "completions/max_terminated_length": 16335.0,
+      "completions/mean_length": 3930.5859375,
+      "completions/mean_terminated_length": 3930.5859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8666863515973091,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005471619311720133,
+      "learning_rate": 1e-05,
+      "loss": -0.0779,
+      "num_tokens": 80926721.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3164186179637909,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000040531158447,
+      "sampling/importance_sampling_ratio/min": 0.0002562212466727942,
+      "sampling/sampling_logp_difference/max": 8.269469261169434,
+      "sampling/sampling_logp_difference/mean": 0.017708823084831238,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 6.743997801095247e-06,
+      "clip_ratio/high_mean": 1.6859994502738118e-06,
+      "clip_ratio/low_mean": 3.61007656692891e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7786765119562915e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15546.0,
+      "completions/mean_length": 5934.9453125,
+      "completions/mean_terminated_length": 5684.16845703125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 0.9991667941212654,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002580739092081785,
+      "learning_rate": 1e-05,
+      "loss": -0.0065,
+      "num_tokens": 81707978.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24671243131160736,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000852346420288,
+      "sampling/importance_sampling_ratio/min": 0.002478762762621045,
+      "sampling/sampling_logp_difference/max": 5.999995708465576,
+      "sampling/sampling_logp_difference/mean": 0.019801246002316475,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.43532002741631e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.43532002741631e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 5866.84375,
+      "completions/mean_terminated_length": 5699.9052734375,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "entropy": 0.9848997294902802,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0010949905263260007,
+      "learning_rate": 1e-05,
+      "loss": 0.0266,
+      "num_tokens": 82477310.0,
+      "reward": 0.2734375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999667406082153,
+      "sampling/importance_sampling_ratio/min": 9.04304688447155e-05,
+      "sampling/sampling_logp_difference/max": 9.310929298400879,
+      "sampling/sampling_logp_difference/mean": 0.020769795402884483,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 1.9307613456476247e-05,
+      "clip_ratio/high_mean": 4.826903364119062e-06,
+      "clip_ratio/low_mean": 5.842190330440644e-05,
+      "clip_ratio/low_min": 1.2287753634154797e-05,
+      "clip_ratio/region_mean": 6.324880496322294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14501.0,
+      "completions/max_terminated_length": 14501.0,
+      "completions/mean_length": 6613.7578125,
+      "completions/mean_terminated_length": 6613.7578125,
+      "completions/min_length": 1033.0,
+      "completions/min_terminated_length": 1033.0,
+      "entropy": 0.9176012054085732,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020384234376251698,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 83345055.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999457001686096,
+      "sampling/importance_sampling_ratio/min": 0.029541675001382828,
+      "sampling/sampling_logp_difference/max": 3.5219533443450928,
+      "sampling/sampling_logp_difference/mean": 0.018883168697357178,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.382043183184578e-05,
+      "clip_ratio/high_mean": 3.455107957961445e-06,
+      "clip_ratio/low_mean": 5.789885449303256e-05,
+      "clip_ratio/low_min": 1.017130716718384e-05,
+      "clip_ratio/region_mean": 6.135396188255982e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 6392.3125,
+      "completions/mean_terminated_length": 6070.0,
+      "completions/min_length": 507.0,
+      "completions/min_terminated_length": 507.0,
+      "entropy": 0.904954232275486,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0031166900880634785,
+      "learning_rate": 1e-05,
+      "loss": 0.0351,
+      "num_tokens": 84186343.0,
+      "reward": 0.390625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999208450317383,
+      "sampling/importance_sampling_ratio/min": 0.00022529886336997151,
+      "sampling/sampling_logp_difference/max": 8.398082733154297,
+      "sampling/sampling_logp_difference/mean": 0.01931958645582199,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.7221671441802755e-05,
+      "clip_ratio/high_mean": 6.549099907715572e-06,
+      "clip_ratio/low_mean": 3.147818074467068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802728065238625e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5982.703125,
+      "completions/mean_terminated_length": 5817.603515625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 0.8394555225968361,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022041688207536936,
+      "learning_rate": 1e-05,
+      "loss": 0.1043,
+      "num_tokens": 84971129.0,
+      "reward": 0.3125,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030828475952,
+      "sampling/importance_sampling_ratio/min": 1.553593506287143e-06,
+      "sampling/sampling_logp_difference/max": 13.374939918518066,
+      "sampling/sampling_logp_difference/mean": 0.01795877143740654,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 2.9651660042873118e-05,
+      "clip_ratio/high_mean": 9.398806923854863e-06,
+      "clip_ratio/low_mean": 4.788733849636628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.728614519284747e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14988.0,
+      "completions/mean_length": 4976.921875,
+      "completions/mean_terminated_length": 4608.95166015625,
+      "completions/min_length": 335.0,
+      "completions/min_terminated_length": 335.0,
+      "entropy": 0.8381234556436539,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0037972736172378063,
+      "learning_rate": 1e-05,
+      "loss": 0.1244,
+      "num_tokens": 85625559.0,
+      "reward": 0.4765625,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970555305481,
+      "sampling/importance_sampling_ratio/min": 0.002990707289427519,
+      "sampling/sampling_logp_difference/max": 5.8122453689575195,
+      "sampling/sampling_logp_difference/mean": 0.01815030723810196,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 4.130592969886493e-06,
+      "clip_ratio/high_mean": 1.0326482424716232e-06,
+      "clip_ratio/low_mean": 1.6904315600640984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7936963843112608e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 6307.2421875,
+      "completions/mean_terminated_length": 6065.400390625,
+      "completions/min_length": 823.0,
+      "completions/min_terminated_length": 823.0,
+      "entropy": 1.1176434755325317,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0012413962977007031,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 86453606.0,
+      "reward": 0.28125,
+      "reward_std": 0.2280253767967224,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 0.004730688873678446,
+      "sampling/sampling_logp_difference/max": 5.353684425354004,
+      "sampling/sampling_logp_difference/mean": 0.021790307015180588,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.3160772823539446e-05,
+      "clip_ratio/high_mean": 3.2901932058848615e-06,
+      "clip_ratio/low_mean": 3.582628983167524e-05,
+      "clip_ratio/low_min": 2.61966624748311e-06,
+      "clip_ratio/region_mean": 3.911648195753514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 7263.1640625,
+      "completions/mean_terminated_length": 7044.26416015625,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.107876107096672,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017762042116373777,
+      "learning_rate": 1e-05,
+      "loss": 0.0349,
+      "num_tokens": 87402763.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741315841675,
+      "sampling/importance_sampling_ratio/min": 0.0009408573969267309,
+      "sampling/sampling_logp_difference/max": 6.968719005584717,
+      "sampling/sampling_logp_difference/mean": 0.02103034406900406,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.987745776612428e-05,
+      "clip_ratio/high_mean": 1.1877163728968299e-05,
+      "clip_ratio/low_mean": 4.26799579145154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.455712096136267e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15416.0,
+      "completions/mean_length": 5093.859375,
+      "completions/mean_terminated_length": 4914.65087890625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 1.1065888702869415,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032127038575708866,
+      "learning_rate": 1e-05,
+      "loss": 0.0194,
+      "num_tokens": 88077385.0,
+      "reward": 0.421875,
+      "reward_std": 0.345874547958374,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 7.033879228401929e-05,
+      "sampling/sampling_logp_difference/max": 9.562187194824219,
+      "sampling/sampling_logp_difference/mean": 0.020314980298280716,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 9.35208754526684e-06,
+      "clip_ratio/high_mean": 4.4788730519940145e-06,
+      "clip_ratio/low_mean": 3.470697703278347e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.918584917528278e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6943.53125,
+      "completions/mean_terminated_length": 6639.0,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "entropy": 0.9009081721305847,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028925195802003145,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "num_tokens": 88985269.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3535328209400177,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980628490448,
+      "sampling/importance_sampling_ratio/min": 6.553035092338177e-08,
+      "sampling/sampling_logp_difference/max": 16.540752410888672,
+      "sampling/sampling_logp_difference/mean": 0.019378282129764557,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 1.0939961612166371e-05,
+      "clip_ratio/high_mean": 2.734990403041593e-06,
+      "clip_ratio/low_mean": 2.4615862798782473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7350853201824066e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15148.0,
+      "completions/max_terminated_length": 15148.0,
+      "completions/mean_length": 4976.25,
+      "completions/mean_terminated_length": 4976.25,
+      "completions/min_length": 702.0,
+      "completions/min_terminated_length": 702.0,
+      "entropy": 0.9463540017604828,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017386430408805609,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 89645205.0,
+      "reward": 0.359375,
+      "reward_std": 0.26462042331695557,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999554753303528,
+      "sampling/importance_sampling_ratio/min": 7.889595508459024e-06,
+      "sampling/sampling_logp_difference/max": 11.74996566772461,
+      "sampling/sampling_logp_difference/mean": 0.018035830929875374,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 5.941629297012696e-06,
+      "clip_ratio/high_mean": 1.485407324253174e-06,
+      "clip_ratio/low_mean": 2.6826061798601586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8311469009167922e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 6439.5390625,
+      "completions/mean_terminated_length": 6281.69091796875,
+      "completions/min_length": 959.0,
+      "completions/min_terminated_length": 959.0,
+      "entropy": 0.899876207113266,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037381781730800867,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 90489394.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2624938488006592,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999206066131592,
+      "sampling/importance_sampling_ratio/min": 0.003606764366850257,
+      "sampling/sampling_logp_difference/max": 5.62494421005249,
+      "sampling/sampling_logp_difference/mean": 0.019368179142475128,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 5.189952389628161e-06,
+      "clip_ratio/high_mean": 1.2974880974070402e-06,
+      "clip_ratio/low_mean": 3.058137212974543e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.187886022715247e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15979.0,
+      "completions/mean_length": 6876.46875,
+      "completions/mean_terminated_length": 6408.884765625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.1018569767475128,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018562980694696307,
+      "learning_rate": 1e-05,
+      "loss": 0.095,
+      "num_tokens": 91390054.0,
+      "reward": 0.21875,
+      "reward_std": 0.29955869913101196,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999849796295166,
+      "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05,
+      "sampling/sampling_logp_difference/max": 10.436432838439941,
+      "sampling/sampling_logp_difference/mean": 0.020825792104005814,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.022083435804234e-05,
+      "clip_ratio/high_mean": 5.055208589510585e-06,
+      "clip_ratio/low_mean": 3.029032552603894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.53455343429232e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14153.0,
+      "completions/mean_length": 6501.5078125,
+      "completions/mean_terminated_length": 6344.64306640625,
+      "completions/min_length": 720.0,
+      "completions/min_terminated_length": 720.0,
+      "entropy": 1.073579266667366,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016695430967956781,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 92241535.0,
+      "reward": 0.2734375,
+      "reward_std": 0.28641316294670105,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998984336853027,
+      "sampling/importance_sampling_ratio/min": 0.0002380236255703494,
+      "sampling/sampling_logp_difference/max": 8.343140602111816,
+      "sampling/sampling_logp_difference/mean": 0.020438479259610176,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 3.3911180707946187e-06,
+      "clip_ratio/high_mean": 8.477795176986547e-07,
+      "clip_ratio/low_mean": 2.2190370486896427e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.30381500614385e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14345.0,
+      "completions/max_terminated_length": 14345.0,
+      "completions/mean_length": 5474.1328125,
+      "completions/mean_terminated_length": 5474.1328125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0692576617002487,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034909825772047043,
+      "learning_rate": 1e-05,
+      "loss": 0.0,
+      "num_tokens": 92962472.0,
+      "reward": 0.3046875,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000006079673767,
+      "sampling/importance_sampling_ratio/min": 0.0017851731972768903,
+      "sampling/sampling_logp_difference/max": 6.328239917755127,
+      "sampling/sampling_logp_difference/mean": 0.019930578768253326,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 2.6292200345778838e-05,
+      "clip_ratio/high_mean": 7.620442374900449e-06,
+      "clip_ratio/low_mean": 4.615546390596137e-05,
+      "clip_ratio/low_min": 1.366510537081922e-05,
+      "clip_ratio/region_mean": 5.3775906508235494e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7512.078125,
+      "completions/mean_terminated_length": 7225.88671875,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9676955863833427,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023449272848665714,
+      "learning_rate": 1e-05,
+      "loss": 0.0454,
+      "num_tokens": 93950506.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999359250068665,
+      "sampling/importance_sampling_ratio/min": 0.0016406332142651081,
+      "sampling/sampling_logp_difference/max": 6.412672996520996,
+      "sampling/sampling_logp_difference/mean": 0.020141655579209328,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 5.097255780128762e-06,
+      "clip_ratio/high_mean": 1.2743139450321905e-06,
+      "clip_ratio/low_mean": 3.3802551342887455e-05,
+      "clip_ratio/low_min": 4.146762421441963e-06,
+      "clip_ratio/region_mean": 3.5076865287919645e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6920.484375,
+      "completions/mean_terminated_length": 6693.3603515625,
+      "completions/min_length": 962.0,
+      "completions/min_terminated_length": 962.0,
+      "entropy": 0.8662540689110756,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037103090435266495,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 94854016.0,
+      "reward": 0.4375,
+      "reward_std": 0.322716623544693,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00047686786274425685,
+      "sampling/sampling_logp_difference/max": 7.648271083831787,
+      "sampling/sampling_logp_difference/mean": 0.01915796287357807,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 8.4922439782531e-06,
+      "clip_ratio/high_mean": 2.123060994563275e-06,
+      "clip_ratio/low_mean": 5.024227584726759e-05,
+      "clip_ratio/low_min": 1.3627016414829995e-05,
+      "clip_ratio/region_mean": 5.236533706920454e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 7939.609375,
+      "completions/mean_terminated_length": 7805.57177734375,
+      "completions/min_length": 1260.0,
+      "completions/min_terminated_length": 1260.0,
+      "entropy": 0.9707008600234985,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024642283096909523,
+      "learning_rate": 1e-05,
+      "loss": 0.0788,
+      "num_tokens": 95889966.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998771548271179,
+      "sampling/importance_sampling_ratio/min": 4.540014560916461e-05,
+      "sampling/sampling_logp_difference/max": 9.999995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020453302189707756,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.766829564710861e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.766829564710861e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14969.0,
+      "completions/mean_length": 5985.8203125,
+      "completions/mean_terminated_length": 5474.43408203125,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 0.9083090648055077,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003317479742690921,
+      "learning_rate": 1e-05,
+      "loss": 0.0537,
+      "num_tokens": 96676847.0,
+      "reward": 0.3671875,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.000286750087980181,
+      "sampling/sampling_logp_difference/max": 8.156899452209473,
+      "sampling/sampling_logp_difference/mean": 0.01996719278395176,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.8439853647578275e-05,
+      "clip_ratio/high_mean": 4.609963411894569e-06,
+      "clip_ratio/low_mean": 5.708034223061986e-05,
+      "clip_ratio/low_min": 2.75287948170444e-06,
+      "clip_ratio/region_mean": 6.169030598357494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15081.0,
+      "completions/mean_length": 6565.359375,
+      "completions/mean_terminated_length": 6488.04736328125,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 1.1013468354940414,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019073591101914644,
+      "learning_rate": 1e-05,
+      "loss": 0.0622,
+      "num_tokens": 97539453.0,
+      "reward": 0.2734375,
+      "reward_std": 0.307217001914978,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999555945396423,
+      "sampling/importance_sampling_ratio/min": 0.0006022047018632293,
+      "sampling/sampling_logp_difference/max": 7.414913177490234,
+      "sampling/sampling_logp_difference/mean": 0.02150837704539299,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 9.068485269381199e-06,
+      "clip_ratio/high_mean": 2.2671213173452998e-06,
+      "clip_ratio/low_mean": 1.9822365402433206e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.208948649240483e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16099.0,
+      "completions/mean_length": 6779.6171875,
+      "completions/mean_terminated_length": 6703.9921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8940552547574043,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0010163087863475084,
+      "learning_rate": 1e-05,
+      "loss": 0.0249,
+      "num_tokens": 98429036.0,
+      "reward": 0.453125,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485015869141,
+      "sampling/importance_sampling_ratio/min": 3.464699460664633e-08,
+      "sampling/sampling_logp_difference/max": 17.178054809570312,
+      "sampling/sampling_logp_difference/mean": 0.018716152757406235,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 5.047242211730918e-06,
+      "clip_ratio/high_mean": 1.2618105529327295e-06,
+      "clip_ratio/low_mean": 2.9014110396019532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0275920835265424e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14549.0,
+      "completions/max_terminated_length": 14549.0,
+      "completions/mean_length": 5766.71875,
+      "completions/mean_terminated_length": 5766.71875,
+      "completions/min_length": 47.0,
+      "completions/min_terminated_length": 47.0,
+      "entropy": 1.0455922111868858,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002155766822397709,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 99184264.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999253749847412,
+      "sampling/importance_sampling_ratio/min": 0.00010798005678225309,
+      "sampling/sampling_logp_difference/max": 9.133563995361328,
+      "sampling/sampling_logp_difference/mean": 0.020948775112628937,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 2.0882574972347356e-05,
+      "clip_ratio/high_mean": 6.505383225885453e-06,
+      "clip_ratio/low_mean": 4.496008500609605e-05,
+      "clip_ratio/low_min": 7.757854064038838e-06,
+      "clip_ratio/region_mean": 5.1465468231981504e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14704.0,
+      "completions/mean_length": 6167.2421875,
+      "completions/mean_terminated_length": 6005.07177734375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "entropy": 0.9100174158811569,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0021464223973453045,
+      "learning_rate": 1e-05,
+      "loss": -0.0279,
+      "num_tokens": 99996831.0,
+      "reward": 0.421875,
+      "reward_std": 0.3916535973548889,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240040779114,
+      "sampling/importance_sampling_ratio/min": 0.02249590866267681,
+      "sampling/sampling_logp_difference/max": 3.794421911239624,
+      "sampling/sampling_logp_difference/mean": 0.01866895705461502,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0998018473837874e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0998018473837874e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15738.0,
+      "completions/mean_length": 6242.9453125,
+      "completions/mean_terminated_length": 6163.09423828125,
+      "completions/min_length": 1187.0,
+      "completions/min_terminated_length": 1187.0,
+      "entropy": 0.8624134212732315,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023277695290744305,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 100814112.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999959409236908,
+      "sampling/importance_sampling_ratio/min": 0.0002393616596236825,
+      "sampling/sampling_logp_difference/max": 8.33753490447998,
+      "sampling/sampling_logp_difference/mean": 0.0191188994795084,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 6.589872555196052e-06,
+      "clip_ratio/high_mean": 1.647468138799013e-06,
+      "clip_ratio/low_mean": 4.329304238126497e-05,
+      "clip_ratio/low_min": 3.5120251595799346e-06,
+      "clip_ratio/region_mean": 4.494051017900347e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14866.0,
+      "completions/mean_length": 5733.6875,
+      "completions/mean_terminated_length": 5478.080078125,
+      "completions/min_length": 789.0,
+      "completions/min_terminated_length": 789.0,
+      "entropy": 0.9628067463636398,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003547821193933487,
+      "learning_rate": 1e-05,
+      "loss": 0.0321,
+      "num_tokens": 101566264.0,
+      "reward": 0.3984375,
+      "reward_std": 0.36584997177124023,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0001282035664189607,
+      "sampling/sampling_logp_difference/max": 8.961891174316406,
+      "sampling/sampling_logp_difference/mean": 0.019646761938929558,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.7107527582993498e-05,
+      "clip_ratio/high_mean": 4.2768818957483745e-06,
+      "clip_ratio/low_mean": 3.014796902789385e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.442485103732906e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15848.0,
+      "completions/max_terminated_length": 15848.0,
+      "completions/mean_length": 5505.9375,
+      "completions/mean_terminated_length": 5505.9375,
+      "completions/min_length": 668.0,
+      "completions/min_terminated_length": 668.0,
+      "entropy": 0.8041045889258385,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024891747161746025,
+      "learning_rate": 1e-05,
+      "loss": 0.1406,
+      "num_tokens": 102291456.0,
+      "reward": 0.5,
+      "reward_std": 0.35482609272003174,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999248385429382,
+      "sampling/importance_sampling_ratio/min": 0.0014627616619691253,
+      "sampling/sampling_logp_difference/max": 6.527429103851318,
+      "sampling/sampling_logp_difference/mean": 0.01716250739991665,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.548903105685895e-05,
+      "clip_ratio/high_mean": 3.872257764214737e-06,
+      "clip_ratio/low_mean": 5.380711581892683e-05,
+      "clip_ratio/low_min": 4.5777483137499075e-06,
+      "clip_ratio/region_mean": 5.767937363998499e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16005.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 5003.0625,
+      "completions/mean_terminated_length": 5003.0625,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 0.9115714654326439,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00220683915540576,
+      "learning_rate": 1e-05,
+      "loss": 0.1361,
+      "num_tokens": 102949824.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 8.323705696966499e-05,
+      "sampling/sampling_logp_difference/max": 9.393817901611328,
+      "sampling/sampling_logp_difference/mean": 0.018076512962579727,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.181136096623959e-05,
+      "clip_ratio/high_mean": 5.4528402415598975e-06,
+      "clip_ratio/low_mean": 3.4416837252138066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986967681157694e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15658.0,
+      "completions/max_terminated_length": 15658.0,
+      "completions/mean_length": 4742.1328125,
+      "completions/mean_terminated_length": 4742.1328125,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 0.9430246204137802,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003964806906878948,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 103580913.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 7.031940185697749e-05,
+      "sampling/sampling_logp_difference/max": 9.56246280670166,
+      "sampling/sampling_logp_difference/mean": 0.019651200622320175,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 4.07684046876966e-06,
+      "clip_ratio/high_mean": 1.019210117192415e-06,
+      "clip_ratio/low_mean": 3.8682398553646635e-05,
+      "clip_ratio/low_min": 8.189203072106466e-06,
+      "clip_ratio/region_mean": 3.970160832977854e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 6574.171875,
+      "completions/mean_terminated_length": 6091.72119140625,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.8429529070854187,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002067410387098789,
+      "learning_rate": 1e-05,
+      "loss": 0.0377,
+      "num_tokens": 104447463.0,
+      "reward": 0.3125,
+      "reward_std": 0.24511480331420898,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997583627700806,
+      "sampling/importance_sampling_ratio/min": 0.00021258489869069308,
+      "sampling/sampling_logp_difference/max": 8.456169128417969,
+      "sampling/sampling_logp_difference/mean": 0.018853647634387016,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 1.9725823221961036e-05,
+      "clip_ratio/high_mean": 4.931455805490259e-06,
+      "clip_ratio/low_mean": 5.9263072444082354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.419452870431996e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15518.0,
+      "completions/max_terminated_length": 15518.0,
+      "completions/mean_length": 4581.5625,
+      "completions/mean_terminated_length": 4581.5625,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "entropy": 0.7094272822141647,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004292502999305725,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 105052287.0,
+      "reward": 0.625,
+      "reward_std": 0.3908300995826721,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.0019342642044648528,
+      "sampling/sampling_logp_difference/max": 6.24802827835083,
+      "sampling/sampling_logp_difference/mean": 0.016310662031173706,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.0132298029930098e-05,
+      "clip_ratio/high_mean": 2.5330745074825245e-06,
+      "clip_ratio/low_mean": 4.6397121650443296e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.893019581686531e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16097.0,
+      "completions/mean_length": 7066.4453125,
+      "completions/mean_terminated_length": 6918.5478515625,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8481669947504997,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015785128343850374,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 105977048.0,
+      "reward": 0.3515625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 0.00104097044095397,
+      "sampling/sampling_logp_difference/max": 6.8676018714904785,
+      "sampling/sampling_logp_difference/mean": 0.018304405733942986,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 1.6989023606583942e-05,
+      "clip_ratio/high_mean": 4.2472559016459854e-06,
+      "clip_ratio/low_mean": 2.3075059743860038e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7322315418132348e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16104.0,
+      "completions/max_terminated_length": 16104.0,
+      "completions/mean_length": 6230.5234375,
+      "completions/mean_terminated_length": 6230.5234375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "entropy": 0.9658062160015106,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002542720176279545,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 106793187.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3050953149795532,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000169277191162,
+      "sampling/importance_sampling_ratio/min": 0.0002781494113150984,
+      "sampling/sampling_logp_difference/max": 8.187352180480957,
+      "sampling/sampling_logp_difference/mean": 0.019391046836972237,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7597974508353218e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7597974508353218e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14216.0,
+      "completions/mean_length": 5690.5546875,
+      "completions/mean_terminated_length": 5606.3544921875,
+      "completions/min_length": 1124.0,
+      "completions/min_terminated_length": 1124.0,
+      "entropy": 1.0098655670881271,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001451602904126048,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 107539874.0,
+      "reward": 0.4296875,
+      "reward_std": 0.23304283618927002,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999307990074158,
+      "sampling/importance_sampling_ratio/min": 5.640022671116185e-09,
+      "sampling/sampling_logp_difference/max": 18.993377685546875,
+      "sampling/sampling_logp_difference/mean": 0.018607191741466522,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 1.2800467629858758e-05,
+      "clip_ratio/high_mean": 4.19954119479371e-06,
+      "clip_ratio/low_mean": 2.350350996493944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.770305115973315e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15791.0,
+      "completions/max_terminated_length": 15791.0,
+      "completions/mean_length": 5471.1328125,
+      "completions/mean_terminated_length": 5471.1328125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0413162112236023,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023549250327050686,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 108260091.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999832510948181,
+      "sampling/importance_sampling_ratio/min": 0.0011709182290360332,
+      "sampling/sampling_logp_difference/max": 6.749967098236084,
+      "sampling/sampling_logp_difference/mean": 0.020427243784070015,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.1983064925734652e-05,
+      "clip_ratio/high_mean": 5.495766231433663e-06,
+      "clip_ratio/low_mean": 4.361141452591255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9107180757346214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 6211.7421875,
+      "completions/mean_terminated_length": 6050.2783203125,
+      "completions/min_length": 622.0,
+      "completions/min_terminated_length": 622.0,
+      "entropy": 0.9706784337759018,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017527056625112891,
+      "learning_rate": 1e-05,
+      "loss": 0.0686,
+      "num_tokens": 109073890.0,
+      "reward": 0.421875,
+      "reward_std": 0.29826050996780396,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999092221260071,
+      "sampling/importance_sampling_ratio/min": 0.002898645820096135,
+      "sampling/sampling_logp_difference/max": 5.843511581420898,
+      "sampling/sampling_logp_difference/mean": 0.018898162990808487,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.208964992358233e-05,
+      "clip_ratio/low_min": 3.9168990042526275e-06,
+      "clip_ratio/region_mean": 4.208964992358233e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14880.0,
+      "completions/mean_length": 6007.8984375,
+      "completions/mean_terminated_length": 5926.19677734375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.1967609524726868,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0007858420140109956,
+      "learning_rate": 1e-05,
+      "loss": 0.011,
+      "num_tokens": 109861813.0,
+      "reward": 0.296875,
+      "reward_std": 0.23486506938934326,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 3.294382011631569e-08,
+      "sampling/sampling_logp_difference/max": 17.22846221923828,
+      "sampling/sampling_logp_difference/mean": 0.021845955401659012,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 4.5118208618077915e-06,
+      "clip_ratio/high_mean": 1.1279552154519479e-06,
+      "clip_ratio/low_mean": 3.749712686840212e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8625082197540905e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15838.0,
+      "completions/mean_length": 6800.9921875,
+      "completions/mean_terminated_length": 6725.53564453125,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "entropy": 1.0437887012958527,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029428249690681696,
+      "learning_rate": 1e-05,
+      "loss": 0.0405,
+      "num_tokens": 110756572.0,
+      "reward": 0.265625,
+      "reward_std": 0.3248382806777954,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999890327453613,
+      "sampling/importance_sampling_ratio/min": 0.0006329434108920395,
+      "sampling/sampling_logp_difference/max": 7.365129470825195,
+      "sampling/sampling_logp_difference/mean": 0.02010120078921318,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.427700522071973e-05,
+      "clip_ratio/high_mean": 3.5692513051799324e-06,
+      "clip_ratio/low_mean": 4.964020990883e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.320946092979284e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6309.4453125,
+      "completions/mean_terminated_length": 6230.1181640625,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "entropy": 0.9768906533718109,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002088683657348156,
+      "learning_rate": 1e-05,
+      "loss": 0.0316,
+      "num_tokens": 111585493.0,
+      "reward": 0.375,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000007152557373,
+      "sampling/importance_sampling_ratio/min": 0.009723234921693802,
+      "sampling/sampling_logp_difference/max": 4.633236885070801,
+      "sampling/sampling_logp_difference/mean": 0.020927833393216133,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 5.4841398196003865e-06,
+      "clip_ratio/high_mean": 1.3710349549000966e-06,
+      "clip_ratio/low_mean": 5.122006064084417e-05,
+      "clip_ratio/low_min": 3.785125954891555e-06,
+      "clip_ratio/region_mean": 5.25910957094311e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15209.0,
+      "completions/mean_length": 6221.859375,
+      "completions/mean_terminated_length": 6060.5556640625,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "entropy": 0.9212924689054489,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002406956860795617,
+      "learning_rate": 1e-05,
+      "loss": 0.1051,
+      "num_tokens": 112400363.0,
+      "reward": 0.40625,
+      "reward_std": 0.31929677724838257,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 5.8308287407271564e-05,
+      "sampling/sampling_logp_difference/max": 9.74976634979248,
+      "sampling/sampling_logp_difference/mean": 0.018652018159627914,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 1.4568151755156578e-05,
+      "clip_ratio/high_mean": 3.6420379387891444e-06,
+      "clip_ratio/low_mean": 3.999794398623635e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3639981413434725e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14997.0,
+      "completions/mean_length": 6942.8203125,
+      "completions/mean_terminated_length": 6716.232421875,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "entropy": 0.949538916349411,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022962254006415606,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 113308748.0,
+      "reward": 0.375,
+      "reward_std": 0.3329663872718811,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999334812164307,
+      "sampling/importance_sampling_ratio/min": 0.00048810525913722813,
+      "sampling/sampling_logp_difference/max": 7.624979496002197,
+      "sampling/sampling_logp_difference/mean": 0.01939917355775833,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 8.786732450971613e-06,
+      "clip_ratio/high_mean": 2.196683112742903e-06,
+      "clip_ratio/low_mean": 5.562954720517155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.7826231113722315e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15182.0,
+      "completions/mean_length": 6783.1796875,
+      "completions/mean_terminated_length": 6552.76025390625,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "entropy": 0.9774708449840546,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020560629200190306,
+      "learning_rate": 1e-05,
+      "loss": 0.0473,
+      "num_tokens": 114196235.0,
+      "reward": 0.34375,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998990297317505,
+      "sampling/importance_sampling_ratio/min": 2.4757892447269114e-07,
+      "sampling/sampling_logp_difference/max": 15.211536407470703,
+      "sampling/sampling_logp_difference/mean": 0.019691556692123413,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.799483243303257e-05,
+      "clip_ratio/high_mean": 4.498708108258143e-06,
+      "clip_ratio/low_mean": 2.6389980291696702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0888688343111426e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15549.0,
+      "completions/mean_length": 5568.15625,
+      "completions/mean_terminated_length": 5396.4765625,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "entropy": 0.9303529411554337,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022214846685528755,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 114928047.0,
+      "reward": 0.234375,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999408721923828,
+      "sampling/importance_sampling_ratio/min": 2.1446083337650634e-05,
+      "sampling/sampling_logp_difference/max": 10.749968528747559,
+      "sampling/sampling_logp_difference/mean": 0.01938418298959732,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 1.1957493370573502e-05,
+      "clip_ratio/high_mean": 2.9893733426433755e-06,
+      "clip_ratio/low_mean": 5.885063319510664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.184000585562899e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15340.0,
+      "completions/max_terminated_length": 15340.0,
+      "completions/mean_length": 6086.578125,
+      "completions/mean_terminated_length": 6086.578125,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 0.9131873697042465,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002448044717311859,
+      "learning_rate": 1e-05,
+      "loss": 0.0599,
+      "num_tokens": 115725657.0,
+      "reward": 0.40625,
+      "reward_std": 0.35878273844718933,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999779462814331,
+      "sampling/importance_sampling_ratio/min": 0.02929726243019104,
+      "sampling/sampling_logp_difference/max": 3.530261278152466,
+      "sampling/sampling_logp_difference/mean": 0.019298439845442772,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 1.3385357760853367e-05,
+      "clip_ratio/high_mean": 3.3463394402133417e-06,
+      "clip_ratio/low_mean": 5.717015119444113e-05,
+      "clip_ratio/low_min": 3.4328400033700746e-06,
+      "clip_ratio/region_mean": 6.0516490520967636e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 6442.5390625,
+      "completions/mean_terminated_length": 6203.9443359375,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.8959419652819633,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002013204852119088,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 116571478.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000044584274292,
+      "sampling/importance_sampling_ratio/min": 1.0374163821325055e-06,
+      "sampling/sampling_logp_difference/max": 13.778777122497559,
+      "sampling/sampling_logp_difference/mean": 0.01925014518201351,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 9.34224021875707e-06,
+      "clip_ratio/high_mean": 3.136903728773177e-06,
+      "clip_ratio/low_mean": 2.9738095065567904e-05,
+      "clip_ratio/low_min": 3.7240065466903616e-06,
+      "clip_ratio/region_mean": 3.2874999135401595e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15946.0,
+      "completions/mean_length": 6633.5703125,
+      "completions/mean_terminated_length": 6319.0400390625,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.0223619118332863,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024523327592760324,
+      "learning_rate": 1e-05,
+      "loss": 0.056,
+      "num_tokens": 117440743.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 3.0026931199245155e-05,
+      "sampling/sampling_logp_difference/max": 10.413415908813477,
+      "sampling/sampling_logp_difference/mean": 0.02061290666460991,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 1.4537483366439119e-05,
+      "clip_ratio/high_mean": 3.6343708416097797e-06,
+      "clip_ratio/low_mean": 3.954866042477079e-05,
+      "clip_ratio/low_min": 9.874949228105834e-06,
+      "clip_ratio/region_mean": 4.318303126638057e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15919.0,
+      "completions/mean_length": 7183.0,
+      "completions/mean_terminated_length": 6886.193359375,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "entropy": 0.9815369099378586,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0018688985146582127,
+      "learning_rate": 1e-05,
+      "loss": 0.0395,
+      "num_tokens": 118380687.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2498900145292282,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999039173126221,
+      "sampling/importance_sampling_ratio/min": 1.3847662557964213e-05,
+      "sampling/sampling_logp_difference/max": 11.187394142150879,
+      "sampling/sampling_logp_difference/mean": 0.019792160019278526,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 7.165636361605721e-06,
+      "clip_ratio/high_mean": 1.7914090904014301e-06,
+      "clip_ratio/low_mean": 4.9011068711024564e-05,
+      "clip_ratio/low_min": 1.0991705721608014e-05,
+      "clip_ratio/region_mean": 5.0802477687739156e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16246.0,
+      "completions/mean_length": 6324.640625,
+      "completions/mean_terminated_length": 5829.91748046875,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "entropy": 0.852975606918335,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002005894435569644,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 119207089.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000035762786865,
+      "sampling/importance_sampling_ratio/min": 5.788659223071591e-07,
+      "sampling/sampling_logp_difference/max": 14.362195014953613,
+      "sampling/sampling_logp_difference/mean": 0.01853565312922001,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 7.795394822096569e-06,
+      "clip_ratio/high_mean": 1.948848705524142e-06,
+      "clip_ratio/low_mean": 3.834237736555224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0291225786859286e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 5723.421875,
+      "completions/mean_terminated_length": 5290.06494140625,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.8744911625981331,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002577397273853421,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 119961895.0,
+      "reward": 0.390625,
+      "reward_std": 0.34321609139442444,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999703764915466,
+      "sampling/importance_sampling_ratio/min": 0.07882421463727951,
+      "sampling/sampling_logp_difference/max": 2.5405349731445312,
+      "sampling/sampling_logp_difference/mean": 0.018341556191444397,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 9.214097190124448e-06,
+      "clip_ratio/high_mean": 2.303524297531112e-06,
+      "clip_ratio/low_mean": 2.636873176697918e-05,
+      "clip_ratio/low_min": 2.9339967113628518e-06,
+      "clip_ratio/region_mean": 2.8672255837136618e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16055.0,
+      "completions/mean_length": 7886.015625,
+      "completions/mean_terminated_length": 7682.064453125,
+      "completions/min_length": 989.0,
+      "completions/min_terminated_length": 989.0,
+      "entropy": 0.9391767829656601,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002552987542003393,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 120990289.0,
+      "reward": 0.328125,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000030994415283,
+      "sampling/importance_sampling_ratio/min": 0.000899312668479979,
+      "sampling/sampling_logp_difference/max": 7.013879776000977,
+      "sampling/sampling_logp_difference/mean": 0.02049873024225235,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 3.406416203688423e-05,
+      "clip_ratio/high_mean": 9.72330332160709e-06,
+      "clip_ratio/low_mean": 3.168332909808669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140663151019908e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 6173.1640625,
+      "completions/mean_terminated_length": 6011.087890625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.9148785546422005,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002678362652659416,
+      "learning_rate": 1e-05,
+      "loss": 0.039,
+      "num_tokens": 121797958.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3608373999595642,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999265074729919,
+      "sampling/importance_sampling_ratio/min": 0.002013920107856393,
+      "sampling/sampling_logp_difference/max": 6.207672119140625,
+      "sampling/sampling_logp_difference/mean": 0.018977735191583633,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 1.8476588593330234e-05,
+      "clip_ratio/high_mean": 4.6191471483325586e-06,
+      "clip_ratio/low_mean": 4.459614581264759e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9215293188353826e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 6594.21875,
+      "completions/mean_terminated_length": 6196.259765625,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "entropy": 0.9486038386821747,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033711253199726343,
+      "learning_rate": 1e-05,
+      "loss": 0.026,
+      "num_tokens": 122661170.0,
+      "reward": 0.3828125,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998981356620789,
+      "sampling/importance_sampling_ratio/min": 0.0002968419576063752,
+      "sampling/sampling_logp_difference/max": 8.122310638427734,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 7.97335997049231e-06,
+      "clip_ratio/high_mean": 2.7343705824023345e-06,
+      "clip_ratio/low_mean": 5.420079878604156e-05,
+      "clip_ratio/low_min": 4.594068286678521e-06,
+      "clip_ratio/region_mean": 5.693517005056492e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15928.0,
+      "completions/mean_length": 6533.9453125,
+      "completions/mean_terminated_length": 6377.595703125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9986584335565567,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017857529455795884,
+      "learning_rate": 1e-05,
+      "loss": 0.0804,
+      "num_tokens": 123518107.0,
+      "reward": 0.34375,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998549818992615,
+      "sampling/importance_sampling_ratio/min": 9.012701411847956e-06,
+      "sampling/sampling_logp_difference/max": 11.616875648498535,
+      "sampling/sampling_logp_difference/mean": 0.02010391652584076,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 4.470512521947967e-06,
+      "clip_ratio/high_mean": 1.1176281304869917e-06,
+      "clip_ratio/low_mean": 3.5141094485879876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.625872295742738e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13212.0,
+      "completions/mean_length": 5742.21875,
+      "completions/mean_terminated_length": 5658.42529296875,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0379670709371567,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018227624241262674,
+      "learning_rate": 1e-05,
+      "loss": -0.0237,
+      "num_tokens": 124279031.0,
+      "reward": 0.21875,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998506903648376,
+      "sampling/importance_sampling_ratio/min": 0.0020977305248379707,
+      "sampling/sampling_logp_difference/max": 6.16689920425415,
+      "sampling/sampling_logp_difference/mean": 0.019987668842077255,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.0003542683989508e-05,
+      "clip_ratio/high_mean": 3.21091931709816e-06,
+      "clip_ratio/low_mean": 5.731009014198207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.0521009800140746e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7584.703125,
+      "completions/mean_terminated_length": 7515.41748046875,
+      "completions/min_length": 884.0,
+      "completions/min_terminated_length": 884.0,
+      "entropy": 0.953459307551384,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002219022251665592,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 125270761.0,
+      "reward": 0.359375,
+      "reward_std": 0.37033066153526306,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999880790710449,
+      "sampling/importance_sampling_ratio/min": 0.0024849213659763336,
+      "sampling/sampling_logp_difference/max": 5.997514247894287,
+      "sampling/sampling_logp_difference/mean": 0.020291510969400406,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 7.734669452474918e-06,
+      "clip_ratio/high_mean": 1.9336673631187296e-06,
+      "clip_ratio/low_mean": 3.1135301298945706e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3068968605221016e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 4714.671875,
+      "completions/mean_terminated_length": 4622.78759765625,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 1.018719919025898,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0014189074281603098,
+      "learning_rate": 1e-05,
+      "loss": 0.0501,
+      "num_tokens": 125895279.0,
+      "reward": 0.3984375,
+      "reward_std": 0.28383445739746094,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479651451111,
+      "sampling/importance_sampling_ratio/min": 4.017410901724361e-07,
+      "sampling/sampling_logp_difference/max": 14.727458000183105,
+      "sampling/sampling_logp_difference/mean": 0.018739396706223488,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 1.0069575182569679e-05,
+      "clip_ratio/high_mean": 2.5173937956424197e-06,
+      "clip_ratio/low_mean": 3.824179225375701e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0759185367278405e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15913.0,
+      "completions/mean_length": 6316.140625,
+      "completions/mean_terminated_length": 6074.51220703125,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "entropy": 0.9325072392821312,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001702460227534175,
+      "learning_rate": 1e-05,
+      "loss": 0.1007,
+      "num_tokens": 126722881.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999539852142334,
+      "sampling/importance_sampling_ratio/min": 0.0012551364488899708,
+      "sampling/sampling_logp_difference/max": 6.680510997772217,
+      "sampling/sampling_logp_difference/mean": 0.01929408684372902,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 6.873041002108948e-06,
+      "clip_ratio/high_mean": 1.718260250527237e-06,
+      "clip_ratio/low_mean": 3.119859468370123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.291685527528898e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15832.0,
+      "completions/mean_length": 4687.140625,
+      "completions/mean_terminated_length": 4595.03955078125,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 1.0886607319116592,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032931750174611807,
+      "learning_rate": 1e-05,
+      "loss": 0.0078,
+      "num_tokens": 127341715.0,
+      "reward": 0.28125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821186065674,
+      "sampling/importance_sampling_ratio/min": 0.0019364450126886368,
+      "sampling/sampling_logp_difference/max": 6.246901512145996,
+      "sampling/sampling_logp_difference/mean": 0.020621225237846375,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 1.773085250533768e-05,
+      "clip_ratio/high_mean": 4.43271312633442e-06,
+      "clip_ratio/low_mean": 4.30743207289197e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7507033741567284e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14125.0,
+      "completions/mean_length": 5705.515625,
+      "completions/mean_terminated_length": 5449.232421875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0523068830370903,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0031696646474301815,
+      "learning_rate": 1e-05,
+      "loss": -0.0414,
+      "num_tokens": 128093597.0,
+      "reward": 0.1953125,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619126319885,
+      "sampling/importance_sampling_ratio/min": 3.197810656274669e-05,
+      "sampling/sampling_logp_difference/max": 10.350459098815918,
+      "sampling/sampling_logp_difference/mean": 0.021961934864521027,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 1.885905066956184e-05,
+      "clip_ratio/high_mean": 4.71476266739046e-06,
+      "clip_ratio/low_mean": 5.0530389898995054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.524515336219338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15958.0,
+      "completions/mean_length": 6214.4921875,
+      "completions/mean_terminated_length": 6053.07177734375,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.9371421113610268,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023704832419753075,
+      "learning_rate": 1e-05,
+      "loss": 0.075,
+      "num_tokens": 128906948.0,
+      "reward": 0.40625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.0003354824730195105,
+      "sampling/sampling_logp_difference/max": 7.999940872192383,
+      "sampling/sampling_logp_difference/mean": 0.01882763020694256,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 3.042072216885572e-05,
+      "clip_ratio/high_mean": 7.60518054221393e-06,
+      "clip_ratio/low_mean": 4.5897569179942366e-05,
+      "clip_ratio/low_min": 8.727477506909054e-06,
+      "clip_ratio/region_mean": 5.3502750233747065e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 7127.0703125,
+      "completions/mean_terminated_length": 7054.18115234375,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.9854387491941452,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003370177699252963,
+      "learning_rate": 1e-05,
+      "loss": 0.1197,
+      "num_tokens": 129839813.0,
+      "reward": 0.359375,
+      "reward_std": 0.3329663574695587,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999907910823822,
+      "sampling/importance_sampling_ratio/min": 1.077816432371037e-05,
+      "sampling/sampling_logp_difference/max": 11.43798828125,
+      "sampling/sampling_logp_difference/mean": 0.019736800342798233,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.1401074718596647e-05,
+      "clip_ratio/high_mean": 6.243764005375851e-06,
+      "clip_ratio/low_mean": 3.2797592325550795e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.904135610355297e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 6566.2890625,
+      "completions/mean_terminated_length": 6330.6640625,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.7978609576821327,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026055986527353525,
+      "learning_rate": 1e-05,
+      "loss": 0.0661,
+      "num_tokens": 130698370.0,
+      "reward": 0.5,
+      "reward_std": 0.36295419931411743,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999133944511414,
+      "sampling/importance_sampling_ratio/min": 0.00031152591691352427,
+      "sampling/sampling_logp_difference/max": 8.074028015136719,
+      "sampling/sampling_logp_difference/mean": 0.01787097379565239,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0564424403346493e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0564424403346493e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15576.0,
+      "completions/max_terminated_length": 15576.0,
+      "completions/mean_length": 7186.2890625,
+      "completions/mean_terminated_length": 7186.2890625,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 1.0232757329940796,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0023866184055805206,
+      "learning_rate": 1e-05,
+      "loss": 0.0683,
+      "num_tokens": 131637439.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2059282809495926,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999207258224487,
+      "sampling/importance_sampling_ratio/min": 0.0007378471200354397,
+      "sampling/sampling_logp_difference/max": 7.211773872375488,
+      "sampling/sampling_logp_difference/mean": 0.02137116715312004,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 4.037900725961663e-05,
+      "clip_ratio/high_mean": 1.0094751814904157e-05,
+      "clip_ratio/low_mean": 5.8380828136250784e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.847557995115494e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13638.0,
+      "completions/mean_length": 5591.5703125,
+      "completions/mean_terminated_length": 5420.26220703125,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9335208311676979,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003491115989163518,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 132371816.0,
+      "reward": 0.5,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999891459941864,
+      "sampling/importance_sampling_ratio/min": 0.00012356207298580557,
+      "sampling/sampling_logp_difference/max": 8.998766899108887,
+      "sampling/sampling_logp_difference/mean": 0.018760837614536285,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 2.8378776733006816e-06,
+      "clip_ratio/high_mean": 7.094694183251704e-07,
+      "clip_ratio/low_mean": 4.4085751369493664e-05,
+      "clip_ratio/low_min": 6.7955093072669115e-06,
+      "clip_ratio/region_mean": 4.4795220674132e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16302.0,
+      "completions/mean_length": 7152.3828125,
+      "completions/mean_terminated_length": 6930.82421875,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.1329835206270218,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002830669516697526,
+      "learning_rate": 1e-05,
+      "loss": 0.0526,
+      "num_tokens": 133307297.0,
+      "reward": 0.28125,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999501705169678,
+      "sampling/importance_sampling_ratio/min": 0.00028047082014381886,
+      "sampling/sampling_logp_difference/max": 8.179040908813477,
+      "sampling/sampling_logp_difference/mean": 0.021548541262745857,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.0150829439226072e-05,
+      "clip_ratio/high_mean": 2.537707359806518e-06,
+      "clip_ratio/low_mean": 3.4009618616437365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.654732597624388e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15068.0,
+      "completions/mean_length": 7263.453125,
+      "completions/mean_terminated_length": 7118.68310546875,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.092760555446148,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0027821618132293224,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 134260107.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999946117401123,
+      "sampling/importance_sampling_ratio/min": 7.832317351130769e-05,
+      "sampling/sampling_logp_difference/max": 9.454667091369629,
+      "sampling/sampling_logp_difference/mean": 0.022098438814282417,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 1.0561876024439698e-05,
+      "clip_ratio/high_mean": 2.6404690061099245e-06,
+      "clip_ratio/low_mean": 1.6864279416495265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9504748649978865e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15388.0,
+      "completions/mean_length": 7088.8125,
+      "completions/mean_terminated_length": 6710.958984375,
+      "completions/min_length": 1314.0,
+      "completions/min_terminated_length": 1314.0,
+      "entropy": 1.0669445469975471,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0007076738984324038,
+      "learning_rate": 1e-05,
+      "loss": -0.0197,
+      "num_tokens": 135186139.0,
+      "reward": 0.328125,
+      "reward_std": 0.20593319833278656,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998199343681335,
+      "sampling/importance_sampling_ratio/min": 3.084653872065246e-05,
+      "sampling/sampling_logp_difference/max": 10.386486053466797,
+      "sampling/sampling_logp_difference/mean": 0.020075790584087372,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 7.095016371749807e-06,
+      "clip_ratio/high_mean": 1.7737540929374518e-06,
+      "clip_ratio/low_mean": 2.7592465016823553e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.936621888238733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15626.0,
+      "completions/max_terminated_length": 15626.0,
+      "completions/mean_length": 5352.734375,
+      "completions/mean_terminated_length": 5352.734375,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "entropy": 1.0387161895632744,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0022445612121373415,
+      "learning_rate": 1e-05,
+      "loss": 0.0261,
+      "num_tokens": 135888929.0,
+      "reward": 0.4765625,
+      "reward_std": 0.399257630109787,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054670333862,
+      "sampling/importance_sampling_ratio/min": 0.00032565294532105327,
+      "sampling/sampling_logp_difference/max": 8.029678344726562,
+      "sampling/sampling_logp_difference/mean": 0.02010166086256504,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 1.5100852124305675e-05,
+      "clip_ratio/high_mean": 4.426987970873597e-06,
+      "clip_ratio/low_mean": 2.7625993425317574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2052981168817496e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16266.0,
+      "completions/mean_length": 7758.90625,
+      "completions/mean_terminated_length": 7408.29248046875,
+      "completions/min_length": 742.0,
+      "completions/min_terminated_length": 742.0,
+      "entropy": 1.0648984238505363,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022021254990249872,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 136901941.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858140945435,
+      "sampling/importance_sampling_ratio/min": 2.2461865967216e-07,
+      "sampling/sampling_logp_difference/max": 15.30886173248291,
+      "sampling/sampling_logp_difference/mean": 0.021426808089017868,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 2.5346608254039893e-05,
+      "clip_ratio/high_mean": 7.4063813144675805e-06,
+      "clip_ratio/low_mean": 2.2069365058996482e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9475746259777225e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 7036.953125,
+      "completions/mean_terminated_length": 6496.21484375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9684997871518135,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0013461806811392307,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 137824623.0,
+      "reward": 0.34375,
+      "reward_std": 0.2546031177043915,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999944806098938,
+      "sampling/importance_sampling_ratio/min": 5.834372132085264e-05,
+      "sampling/sampling_logp_difference/max": 9.74915885925293,
+      "sampling/sampling_logp_difference/mean": 0.020304443314671516,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.3147734080121154e-05,
+      "clip_ratio/high_mean": 3.2869335200302885e-06,
+      "clip_ratio/low_mean": 4.841489999307669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.170183294467279e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15500.0,
+      "completions/mean_length": 6114.1875,
+      "completions/mean_terminated_length": 5951.1748046875,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "entropy": 0.943072073161602,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132438588887453,
+      "learning_rate": 1e-05,
+      "loss": 0.0943,
+      "num_tokens": 138625247.0,
+      "reward": 0.40625,
+      "reward_std": 0.321650892496109,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999298453330994,
+      "sampling/importance_sampling_ratio/min": 0.0017275095451623201,
+      "sampling/sampling_logp_difference/max": 6.361074447631836,
+      "sampling/sampling_logp_difference/mean": 0.020084267482161522,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 1.7873157958092634e-05,
+      "clip_ratio/high_mean": 4.468289489523158e-06,
+      "clip_ratio/low_mean": 3.5252990301160025e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9721279790683184e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15050.0,
+      "completions/mean_length": 7618.875,
+      "completions/mean_terminated_length": 7034.53369140625,
+      "completions/min_length": 1030.0,
+      "completions/min_terminated_length": 1030.0,
+      "entropy": 0.9142575263977051,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026741649489849806,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 139619287.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 0.005949751473963261,
+      "sampling/sampling_logp_difference/max": 5.124405860900879,
+      "sampling/sampling_logp_difference/mean": 0.020061582326889038,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.0512151675357018e-05,
+      "clip_ratio/high_mean": 2.6280379188392544e-06,
+      "clip_ratio/low_mean": 4.5301517502593924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.792955542143318e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16106.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 5333.875,
+      "completions/mean_terminated_length": 5333.875,
+      "completions/min_length": 1109.0,
+      "completions/min_terminated_length": 1109.0,
+      "entropy": 0.8107482865452766,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027016003150492907,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 140318935.0,
+      "reward": 0.5703125,
+      "reward_std": 0.2556639611721039,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.006856904830783606,
+      "sampling/sampling_logp_difference/max": 4.982499122619629,
+      "sampling/sampling_logp_difference/mean": 0.017069874331355095,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.85085939392593e-05,
+      "clip_ratio/high_mean": 5.24943533264377e-06,
+      "clip_ratio/low_mean": 5.6120721524166584e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.137015702734061e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16050.0,
+      "completions/mean_length": 7443.3046875,
+      "completions/mean_terminated_length": 7154.89501953125,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 0.9224414080381393,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002655779244378209,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 141293534.0,
+      "reward": 0.234375,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999659061431885,
+      "sampling/importance_sampling_ratio/min": 0.00042018835665658116,
+      "sampling/sampling_logp_difference/max": 7.774807453155518,
+      "sampling/sampling_logp_difference/mean": 0.02006504125893116,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.494229445597739e-05,
+      "clip_ratio/high_mean": 3.7355736139943474e-06,
+      "clip_ratio/low_mean": 2.2748562741981004e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6484136355975352e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15923.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 5646.6875,
+      "completions/mean_terminated_length": 5646.6875,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8945339694619179,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016281780553981662,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 142037438.0,
+      "reward": 0.46875,
+      "reward_std": 0.17912296950817108,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030517578125,
+      "sampling/importance_sampling_ratio/min": 0.0005717006279155612,
+      "sampling/sampling_logp_difference/max": 7.46689510345459,
+      "sampling/sampling_logp_difference/mean": 0.019336247816681862,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 3.335990868436056e-05,
+      "clip_ratio/high_mean": 8.33997717109014e-06,
+      "clip_ratio/low_mean": 3.5050728683927446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339070608239126e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14142.0,
+      "completions/mean_length": 6384.640625,
+      "completions/mean_terminated_length": 5892.86865234375,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.840093269944191,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002166559686884284,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 142873848.0,
+      "reward": 0.4765625,
+      "reward_std": 0.35506346821784973,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000462532043457,
+      "sampling/importance_sampling_ratio/min": 4.785555574926548e-06,
+      "sampling/sampling_logp_difference/max": 12.249908447265625,
+      "sampling/sampling_logp_difference/mean": 0.018109092488884926,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.541105484648142e-05,
+      "clip_ratio/high_mean": 3.852763711620355e-06,
+      "clip_ratio/low_mean": 4.0552770769863855e-05,
+      "clip_ratio/low_min": 7.133888630050933e-06,
+      "clip_ratio/region_mean": 4.440553459517105e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14828.0,
+      "completions/mean_length": 5775.0,
+      "completions/mean_terminated_length": 5691.46435546875,
+      "completions/min_length": 1147.0,
+      "completions/min_terminated_length": 1147.0,
+      "entropy": 0.8915362879633904,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021932912059128284,
+      "learning_rate": 1e-05,
+      "loss": -0.0086,
+      "num_tokens": 143636152.0,
+      "reward": 0.4375,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000008225440979,
+      "sampling/importance_sampling_ratio/min": 9.714113069492214e-09,
+      "sampling/sampling_logp_difference/max": 18.44968605041504,
+      "sampling/sampling_logp_difference/mean": 0.019278086721897125,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7509142171311396e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7509142171311396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6181.640625,
+      "completions/mean_terminated_length": 6019.69873046875,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "entropy": 1.0544511675834656,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022947140969336033,
+      "learning_rate": 1e-05,
+      "loss": 0.0242,
+      "num_tokens": 144447370.0,
+      "reward": 0.234375,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999147653579712,
+      "sampling/importance_sampling_ratio/min": 7.419757253046555e-08,
+      "sampling/sampling_logp_difference/max": 16.416534423828125,
+      "sampling/sampling_logp_difference/mean": 0.02050788700580597,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.5700999938417226e-05,
+      "clip_ratio/high_mean": 3.9252499846043065e-06,
+      "clip_ratio/low_mean": 2.4595847037289786e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8521096965050674e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 6542.3046875,
+      "completions/mean_terminated_length": 6306.1044921875,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.933225467801094,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034910975955426693,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 145303505.0,
+      "reward": 0.390625,
+      "reward_std": 0.30433881282806396,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999945163726807,
+      "sampling/importance_sampling_ratio/min": 0.007213745731860399,
+      "sampling/sampling_logp_difference/max": 4.931766986846924,
+      "sampling/sampling_logp_difference/mean": 0.020022759214043617,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.0999414017715026e-06,
+      "clip_ratio/high_mean": 1.5249853504428756e-06,
+      "clip_ratio/low_mean": 2.61421698724007e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7667155109156738e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 5889.4765625,
+      "completions/mean_terminated_length": 5637.6083984375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 0.9649673849344254,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024078311398625374,
+      "learning_rate": 1e-05,
+      "loss": 0.0391,
+      "num_tokens": 146082198.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999341368675232,
+      "sampling/importance_sampling_ratio/min": 0.0008680344326421618,
+      "sampling/sampling_logp_difference/max": 7.04927921295166,
+      "sampling/sampling_logp_difference/mean": 0.02060198038816452,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 7.789618393871933e-06,
+      "clip_ratio/high_mean": 1.9474045984679833e-06,
+      "clip_ratio/low_mean": 3.6395756637830345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.834316100892465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16233.0,
+      "completions/mean_length": 5349.2421875,
+      "completions/mean_terminated_length": 5084.408203125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8402756005525589,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021191861014813185,
+      "learning_rate": 1e-05,
+      "loss": 0.1275,
+      "num_tokens": 146786245.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999837875366211,
+      "sampling/importance_sampling_ratio/min": 3.763807762879878e-05,
+      "sampling/sampling_logp_difference/max": 10.187494277954102,
+      "sampling/sampling_logp_difference/mean": 0.017112664878368378,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 1.2461773394534248e-05,
+      "clip_ratio/high_mean": 3.115443348633562e-06,
+      "clip_ratio/low_mean": 5.095924211673264e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4074685294835945e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 7272.3203125,
+      "completions/mean_terminated_length": 7053.64013671875,
+      "completions/min_length": 1074.0,
+      "completions/min_terminated_length": 1074.0,
+      "entropy": 0.9627499282360077,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022120666690170765,
+      "learning_rate": 1e-05,
+      "loss": 0.0079,
+      "num_tokens": 147737086.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27304792404174805,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999538660049438,
+      "sampling/importance_sampling_ratio/min": 1.6960719221970066e-05,
+      "sampling/sampling_logp_difference/max": 10.984610557556152,
+      "sampling/sampling_logp_difference/mean": 0.0203307643532753,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 1.7891727566166082e-05,
+      "clip_ratio/high_mean": 4.472931891541521e-06,
+      "clip_ratio/low_mean": 5.616715043288423e-05,
+      "clip_ratio/low_min": 7.80031223257538e-06,
+      "clip_ratio/region_mean": 6.064008221073891e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16212.0,
+      "completions/mean_length": 6387.1875,
+      "completions/mean_terminated_length": 5895.54052734375,
+      "completions/min_length": 1310.0,
+      "completions/min_terminated_length": 1310.0,
+      "entropy": 0.9110158830881119,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030851473566144705,
+      "learning_rate": 1e-05,
+      "loss": 0.1091,
+      "num_tokens": 148573782.0,
+      "reward": 0.40625,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997878074646,
+      "sampling/importance_sampling_ratio/min": 0.003961040172725916,
+      "sampling/sampling_logp_difference/max": 5.531248569488525,
+      "sampling/sampling_logp_difference/mean": 0.018049638718366623,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 1.6994396901282016e-05,
+      "clip_ratio/high_mean": 5.400205964178895e-06,
+      "clip_ratio/low_mean": 3.274822392995702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8148429439388565e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 7267.59375,
+      "completions/mean_terminated_length": 7195.81103515625,
+      "completions/min_length": 653.0,
+      "completions/min_terminated_length": 653.0,
+      "entropy": 0.9254888147115707,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020694085396826267,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 149521258.0,
+      "reward": 0.2734375,
+      "reward_std": 0.29719972610473633,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054670333862,
+      "sampling/importance_sampling_ratio/min": 7.411616934405174e-06,
+      "sampling/sampling_logp_difference/max": 11.812461853027344,
+      "sampling/sampling_logp_difference/mean": 0.01898832805454731,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 4.10414668294834e-06,
+      "clip_ratio/high_mean": 1.026036670737085e-06,
+      "clip_ratio/low_mean": 4.7441100377909606e-05,
+      "clip_ratio/low_min": 4.552241534838686e-06,
+      "clip_ratio/region_mean": 4.8467136821273016e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16076.0,
+      "completions/mean_length": 7100.1953125,
+      "completions/mean_terminated_length": 6952.83349609375,
+      "completions/min_length": 560.0,
+      "completions/min_terminated_length": 560.0,
+      "entropy": 0.8455610796809196,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003085972974076867,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 150447923.0,
+      "reward": 0.25,
+      "reward_std": 0.23645778000354767,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999178647994995,
+      "sampling/importance_sampling_ratio/min": 0.0011708807433024049,
+      "sampling/sampling_logp_difference/max": 6.749999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01974140852689743,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.6514521121280268e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6514521121280268e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15535.0,
+      "completions/mean_length": 6626.4296875,
+      "completions/mean_terminated_length": 6549.5986328125,
+      "completions/min_length": 1746.0,
+      "completions/min_terminated_length": 1746.0,
+      "entropy": 1.0323699787259102,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003505800850689411,
+      "learning_rate": 1e-05,
+      "loss": 0.0885,
+      "num_tokens": 151313834.0,
+      "reward": 0.390625,
+      "reward_std": 0.17176413536071777,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381303787231,
+      "sampling/importance_sampling_ratio/min": 2.8102756914449856e-05,
+      "sampling/sampling_logp_difference/max": 10.479642868041992,
+      "sampling/sampling_logp_difference/mean": 0.021082937717437744,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 2.006086378969485e-05,
+      "clip_ratio/high_mean": 5.890002398700744e-06,
+      "clip_ratio/low_mean": 3.503898199141986e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.092898473118112e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15595.0,
+      "completions/mean_length": 7093.109375,
+      "completions/mean_terminated_length": 6870.12841796875,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "entropy": 1.0206764563918114,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002495395252481103,
+      "learning_rate": 1e-05,
+      "loss": 0.0308,
+      "num_tokens": 152238192.0,
+      "reward": 0.2890625,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999728798866272,
+      "sampling/importance_sampling_ratio/min": 9.536534344078973e-05,
+      "sampling/sampling_logp_difference/max": 9.257795333862305,
+      "sampling/sampling_logp_difference/mean": 0.020610272884368896,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 3.2352409107261337e-06,
+      "clip_ratio/high_mean": 8.088102276815334e-07,
+      "clip_ratio/low_mean": 4.056704699451075e-05,
+      "clip_ratio/low_min": 1.1648833606159315e-05,
+      "clip_ratio/region_mean": 4.1375856994818605e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14191.0,
+      "completions/mean_length": 6795.71875,
+      "completions/mean_terminated_length": 6486.4189453125,
+      "completions/min_length": 424.0,
+      "completions/min_terminated_length": 424.0,
+      "entropy": 0.8927837759256363,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014066790463402867,
+      "learning_rate": 1e-05,
+      "loss": -0.0031,
+      "num_tokens": 153131828.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 5.093755135021638e-06,
+      "sampling/sampling_logp_difference/max": 12.187495231628418,
+      "sampling/sampling_logp_difference/mean": 0.01874586008489132,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 1.5244630048982799e-05,
+      "clip_ratio/high_mean": 3.8111575122456998e-06,
+      "clip_ratio/low_mean": 3.655197178886738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.03631290737394e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15831.0,
+      "completions/mean_length": 7075.1015625,
+      "completions/mean_terminated_length": 6617.28662109375,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 0.8989318311214447,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017937121447175741,
+      "learning_rate": 1e-05,
+      "loss": 0.0359,
+      "num_tokens": 154057097.0,
+      "reward": 0.3984375,
+      "reward_std": 0.23068872094154358,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998950958251953,
+      "sampling/importance_sampling_ratio/min": 0.00021659507183358073,
+      "sampling/sampling_logp_difference/max": 8.437480926513672,
+      "sampling/sampling_logp_difference/mean": 0.01890135183930397,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.4074375030759256e-05,
+      "clip_ratio/high_mean": 4.977033995601232e-06,
+      "clip_ratio/low_mean": 3.2670792506905855e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.764782627513341e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14100.0,
+      "completions/mean_length": 7120.0,
+      "completions/mean_terminated_length": 6743.41455078125,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "entropy": 0.8758384585380554,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003410576842725277,
+      "learning_rate": 1e-05,
+      "loss": 0.0536,
+      "num_tokens": 154988585.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999953508377075,
+      "sampling/importance_sampling_ratio/min": 0.003589102067053318,
+      "sampling/sampling_logp_difference/max": 5.629853248596191,
+      "sampling/sampling_logp_difference/mean": 0.018400676548480988,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.977112736994968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.977112736994968e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 6590.6796875,
+      "completions/mean_terminated_length": 6513.56689453125,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9243742749094963,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003304310142993927,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 155851000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999579787254333,
+      "sampling/importance_sampling_ratio/min": 1.2693599273916334e-06,
+      "sampling/sampling_logp_difference/max": 13.576997756958008,
+      "sampling/sampling_logp_difference/mean": 0.01959652081131935,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 1.1435367014200892e-05,
+      "clip_ratio/high_mean": 2.858841753550223e-06,
+      "clip_ratio/low_mean": 4.7742656533955596e-05,
+      "clip_ratio/low_min": 8.646529749967158e-06,
+      "clip_ratio/region_mean": 5.0601498060132144e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 6999.484375,
+      "completions/mean_terminated_length": 6696.7578125,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.843244343996048,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023830258287489414,
+      "learning_rate": 1e-05,
+      "loss": 0.1142,
+      "num_tokens": 156766782.0,
+      "reward": 0.359375,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998635053634644,
+      "sampling/importance_sampling_ratio/min": 0.00014761318743694574,
+      "sampling/sampling_logp_difference/max": 8.820915222167969,
+      "sampling/sampling_logp_difference/mean": 0.018434934318065643,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 2.5114631171163637e-05,
+      "clip_ratio/high_mean": 7.040741365926806e-06,
+      "clip_ratio/low_mean": 5.3607667723554187e-05,
+      "clip_ratio/low_min": 9.219345429301029e-06,
+      "clip_ratio/region_mean": 6.064840863473364e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14986.0,
+      "completions/mean_length": 6407.5,
+      "completions/mean_terminated_length": 6249.14306640625,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 0.9549195989966393,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024427250027656555,
+      "learning_rate": 1e-05,
+      "loss": 0.0795,
+      "num_tokens": 157606126.0,
+      "reward": 0.3515625,
+      "reward_std": 0.32879000902175903,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966025352478,
+      "sampling/importance_sampling_ratio/min": 0.0002305622911080718,
+      "sampling/sampling_logp_difference/max": 8.37498950958252,
+      "sampling/sampling_logp_difference/mean": 0.0192743968218565,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.928529067958152e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.928529067958152e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15519.0,
+      "completions/mean_length": 6638.390625,
+      "completions/mean_terminated_length": 5901.328125,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "entropy": 0.9070822075009346,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002024515997618437,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "num_tokens": 158474248.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999830722808838,
+      "sampling/importance_sampling_ratio/min": 0.0036068728659301996,
+      "sampling/sampling_logp_difference/max": 5.624914169311523,
+      "sampling/sampling_logp_difference/mean": 0.01955476775765419,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 8.365173471247545e-06,
+      "clip_ratio/high_mean": 2.091293367811886e-06,
+      "clip_ratio/low_mean": 4.1470637825113954e-05,
+      "clip_ratio/low_min": 4.027710474474588e-06,
+      "clip_ratio/region_mean": 4.356193130661268e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15708.0,
+      "completions/mean_length": 7324.546875,
+      "completions/mean_terminated_length": 6878.99951171875,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9108889549970627,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022787705529481173,
+      "learning_rate": 1e-05,
+      "loss": 0.0616,
+      "num_tokens": 159434350.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26515230536460876,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999351501464844,
+      "sampling/importance_sampling_ratio/min": 0.03948089852929115,
+      "sampling/sampling_logp_difference/max": 3.231938362121582,
+      "sampling/sampling_logp_difference/mean": 0.019122496247291565,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 8.65733409227687e-06,
+      "clip_ratio/high_mean": 2.1643335230692173e-06,
+      "clip_ratio/low_mean": 3.456336048657249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.672769389595487e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13983.0,
+      "completions/mean_length": 5520.4453125,
+      "completions/mean_terminated_length": 5434.9052734375,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 0.8982062339782715,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026195270475000143,
+      "learning_rate": 1e-05,
+      "loss": 0.049,
+      "num_tokens": 160163055.0,
+      "reward": 0.4375,
+      "reward_std": 0.24831004440784454,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998810291290283,
+      "sampling/importance_sampling_ratio/min": 0.0005541297141462564,
+      "sampling/sampling_logp_difference/max": 7.498111724853516,
+      "sampling/sampling_logp_difference/mean": 0.019064132124185562,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 1.8376186289970065e-05,
+      "clip_ratio/high_mean": 6.650576210631698e-06,
+      "clip_ratio/low_mean": 4.059042771586974e-05,
+      "clip_ratio/low_min": 5.350111223378917e-06,
+      "clip_ratio/region_mean": 4.724100449493562e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15267.0,
+      "completions/max_terminated_length": 15267.0,
+      "completions/mean_length": 6846.515625,
+      "completions/mean_terminated_length": 6846.515625,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9657742157578468,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0014831912703812122,
+      "learning_rate": 1e-05,
+      "loss": 0.006,
+      "num_tokens": 161057657.0,
+      "reward": 0.296875,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999252557754517,
+      "sampling/importance_sampling_ratio/min": 6.252834282349795e-05,
+      "sampling/sampling_logp_difference/max": 9.679890632629395,
+      "sampling/sampling_logp_difference/mean": 0.020372584462165833,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 1.658901419432368e-05,
+      "clip_ratio/high_mean": 4.14725354858092e-06,
+      "clip_ratio/low_mean": 4.473214539757464e-05,
+      "clip_ratio/low_min": 2.9674999950657366e-06,
+      "clip_ratio/region_mean": 4.887939894615556e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16370.0,
+      "completions/mean_length": 6946.8984375,
+      "completions/mean_terminated_length": 6642.4755859375,
+      "completions/min_length": 1133.0,
+      "completions/min_terminated_length": 1133.0,
+      "entropy": 0.8490508273243904,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017962189158424735,
+      "learning_rate": 1e-05,
+      "loss": 0.0696,
+      "num_tokens": 161966356.0,
+      "reward": 0.4296875,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 7.035569433355704e-05,
+      "sampling/sampling_logp_difference/max": 9.561946868896484,
+      "sampling/sampling_logp_difference/mean": 0.019146796315908432,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.22491199540309e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.22491199540309e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15123.0,
+      "completions/mean_length": 6618.9765625,
+      "completions/mean_terminated_length": 6463.9765625,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.9541772454977036,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017619321588426828,
+      "learning_rate": 1e-05,
+      "loss": 0.0509,
+      "num_tokens": 162836705.0,
+      "reward": 0.390625,
+      "reward_std": 0.2130674123764038,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999436140060425,
+      "sampling/importance_sampling_ratio/min": 4.2106199771296815e-07,
+      "sampling/sampling_logp_difference/max": 14.680485725402832,
+      "sampling/sampling_logp_difference/mean": 0.020236656069755554,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 1.6846054222696694e-05,
+      "clip_ratio/high_mean": 4.211513555674173e-06,
+      "clip_ratio/low_mean": 3.877300162002939e-05,
+      "clip_ratio/low_min": 4.230834292684449e-06,
+      "clip_ratio/region_mean": 4.298451551676408e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12469.0,
+      "completions/mean_length": 5485.71875,
+      "completions/mean_terminated_length": 5312.73046875,
+      "completions/min_length": 104.0,
+      "completions/min_terminated_length": 104.0,
+      "entropy": 0.8888534903526306,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002670915797352791,
+      "learning_rate": 1e-05,
+      "loss": 0.0709,
+      "num_tokens": 163558197.0,
+      "reward": 0.46875,
+      "reward_std": 0.3145885467529297,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000442266464233,
+      "sampling/importance_sampling_ratio/min": 0.0005042250850237906,
+      "sampling/sampling_logp_difference/max": 7.592487812042236,
+      "sampling/sampling_logp_difference/mean": 0.019581373780965805,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6889288480779214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6889288480779214e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16184.0,
+      "completions/mean_length": 4345.171875,
+      "completions/mean_terminated_length": 4250.3779296875,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.8308270424604416,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004005427472293377,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 164133499.0,
+      "reward": 0.578125,
+      "reward_std": 0.31642353534698486,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999247193336487,
+      "sampling/importance_sampling_ratio/min": 0.022981969639658928,
+      "sampling/sampling_logp_difference/max": 3.773045301437378,
+      "sampling/sampling_logp_difference/mean": 0.017508968710899353,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.2997116300539346e-05,
+      "clip_ratio/high_mean": 3.2492790751348366e-06,
+      "clip_ratio/low_mean": 2.723402121773688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0483300406558556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5227.296875,
+      "completions/mean_terminated_length": 5050.20654296875,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "entropy": 0.9231975972652435,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0031033784616738558,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 164823681.0,
+      "reward": 0.4765625,
+      "reward_std": 0.29249146580696106,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999896764755249,
+      "sampling/importance_sampling_ratio/min": 0.0021342060063034296,
+      "sampling/sampling_logp_difference/max": 6.149660587310791,
+      "sampling/sampling_logp_difference/mean": 0.019171088933944702,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 2.0835890609305352e-05,
+      "clip_ratio/high_mean": 5.208972652326338e-06,
+      "clip_ratio/low_mean": 2.9314877565411734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.452385044511175e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14160.0,
+      "completions/mean_length": 6473.4765625,
+      "completions/mean_terminated_length": 6316.1669921875,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 0.9061874598264694,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003495733719319105,
+      "learning_rate": 1e-05,
+      "loss": 0.0785,
+      "num_tokens": 165668798.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000354051589966,
+      "sampling/importance_sampling_ratio/min": 0.0004697878030128777,
+      "sampling/sampling_logp_difference/max": 7.663229465484619,
+      "sampling/sampling_logp_difference/mean": 0.018978482112288475,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.991967162164656e-05,
+      "clip_ratio/low_min": 6.304534053924726e-06,
+      "clip_ratio/region_mean": 3.991967162164656e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14659.0,
+      "completions/mean_length": 7140.1953125,
+      "completions/mean_terminated_length": 6605.4296875,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "entropy": 0.9605444446206093,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002381941769272089,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 166603375.0,
+      "reward": 0.3046875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 0.00043123820796608925,
+      "sampling/sampling_logp_difference/max": 7.748849868774414,
+      "sampling/sampling_logp_difference/mean": 0.021141134202480316,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.4948576790629886e-05,
+      "clip_ratio/high_mean": 3.7371441976574715e-06,
+      "clip_ratio/low_mean": 3.4953729482367635e-05,
+      "clip_ratio/low_min": 3.991060111729894e-06,
+      "clip_ratio/region_mean": 3.869087413477246e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13770.0,
+      "completions/mean_length": 5304.46875,
+      "completions/mean_terminated_length": 5038.56005859375,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.9176690131425858,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040566748939454556,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 167302275.0,
+      "reward": 0.4296875,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999827742576599,
+      "sampling/importance_sampling_ratio/min": 5.001809313398553e-07,
+      "sampling/sampling_logp_difference/max": 14.508296012878418,
+      "sampling/sampling_logp_difference/mean": 0.018822530284523964,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.653866999935417e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.653866999935417e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15791.0,
+      "completions/mean_length": 5796.5,
+      "completions/mean_terminated_length": 5542.400390625,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "entropy": 0.9230027198791504,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021502040326595306,
+      "learning_rate": 1e-05,
+      "loss": 0.0737,
+      "num_tokens": 168063627.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223351478577,
+      "sampling/importance_sampling_ratio/min": 0.009504453279078007,
+      "sampling/sampling_logp_difference/max": 4.655994892120361,
+      "sampling/sampling_logp_difference/mean": 0.01985779032111168,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 1.0863841453101486e-05,
+      "clip_ratio/high_mean": 2.7159603632753715e-06,
+      "clip_ratio/low_mean": 2.4175752741939505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6891713218901714e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14814.0,
+      "completions/mean_length": 6135.4921875,
+      "completions/mean_terminated_length": 6054.79541015625,
+      "completions/min_length": 1259.0,
+      "completions/min_terminated_length": 1259.0,
+      "entropy": 0.869445689022541,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027786416467279196,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 168867858.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999550580978394,
+      "sampling/importance_sampling_ratio/min": 2.6089865059475414e-05,
+      "sampling/sampling_logp_difference/max": 10.553963661193848,
+      "sampling/sampling_logp_difference/mean": 0.018514130264520645,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 4.36788013757905e-06,
+      "clip_ratio/high_mean": 1.0919700343947625e-06,
+      "clip_ratio/low_mean": 1.993327998661698e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0852980330564606e-06,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15691.0,
+      "completions/mean_length": 6268.2421875,
+      "completions/mean_terminated_length": 6025.46435546875,
+      "completions/min_length": 627.0,
+      "completions/min_terminated_length": 627.0,
+      "entropy": 0.951081782579422,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.0007328780484385788,
+      "learning_rate": 1e-05,
+      "loss": 0.0188,
+      "num_tokens": 169689969.0,
+      "reward": 0.3828125,
+      "reward_std": 0.10994865000247955,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000133514404297,
+      "sampling/importance_sampling_ratio/min": 1.6650999896228313e-05,
+      "sampling/sampling_logp_difference/max": 11.003040313720703,
+      "sampling/sampling_logp_difference/mean": 0.02005261555314064,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 2.131336282218399e-05,
+      "clip_ratio/high_mean": 5.3283407055459975e-06,
+      "clip_ratio/low_mean": 3.5254403428552905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.058274430462916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13861.0,
+      "completions/mean_length": 5440.8984375,
+      "completions/mean_terminated_length": 5354.732421875,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 0.8271932750940323,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034721922129392624,
+      "learning_rate": 1e-05,
+      "loss": -0.0245,
+      "num_tokens": 170409292.0,
+      "reward": 0.53125,
+      "reward_std": 0.30327308177948,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998912811279297,
+      "sampling/importance_sampling_ratio/min": 1.8372484191786498e-05,
+      "sampling/sampling_logp_difference/max": 10.904656410217285,
+      "sampling/sampling_logp_difference/mean": 0.019136395305395126,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 1.2339016848272877e-05,
+      "clip_ratio/high_mean": 4.13687178024702e-06,
+      "clip_ratio/low_mean": 2.156280152121326e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.569967330146028e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15086.0,
+      "completions/mean_length": 6671.046875,
+      "completions/mean_terminated_length": 6594.56689453125,
+      "completions/min_length": 748.0,
+      "completions/min_terminated_length": 748.0,
+      "entropy": 0.9659745842218399,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027575206477195024,
+      "learning_rate": 1e-05,
+      "loss": 0.0286,
+      "num_tokens": 171280714.0,
+      "reward": 0.375,
+      "reward_std": 0.2109457552433014,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411702156067,
+      "sampling/importance_sampling_ratio/min": 1.5700872609158978e-05,
+      "sampling/sampling_logp_difference/max": 11.06179428100586,
+      "sampling/sampling_logp_difference/mean": 0.019089506939053535,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 1.4603458112105727e-05,
+      "clip_ratio/high_mean": 3.650864528026432e-06,
+      "clip_ratio/low_mean": 3.2977761520669446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.662862599185246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15752.0,
+      "completions/mean_length": 7781.5546875,
+      "completions/mean_terminated_length": 7504.05615234375,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 1.1691131889820099,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0012711051385849714,
+      "learning_rate": 1e-05,
+      "loss": 0.0115,
+      "num_tokens": 172302489.0,
+      "reward": 0.109375,
+      "reward_std": 0.1751839816570282,
+      "rewards/accuracy_reward/mean": 0.109375,
+      "rewards/accuracy_reward/std": 0.31333550810813904,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998820424079895,
+      "sampling/importance_sampling_ratio/min": 0.005086081102490425,
+      "sampling/sampling_logp_difference/max": 5.281247615814209,
+      "sampling/sampling_logp_difference/mean": 0.023309212177991867,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 6.842087486802484e-06,
+      "clip_ratio/high_mean": 1.710521871700621e-06,
+      "clip_ratio/low_mean": 4.5269940528669395e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6980462457213434e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14891.0,
+      "completions/mean_length": 6489.96875,
+      "completions/mean_terminated_length": 6332.9208984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9354017227888107,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0016933141741901636,
+      "learning_rate": 1e-05,
+      "loss": 0.0156,
+      "num_tokens": 173149653.0,
+      "reward": 0.484375,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 0.008998609147965908,
+      "sampling/sampling_logp_difference/max": 4.7106852531433105,
+      "sampling/sampling_logp_difference/mean": 0.019165027886629105,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 2.444740721330163e-05,
+      "clip_ratio/high_mean": 6.111851803325408e-06,
+      "clip_ratio/low_mean": 3.0998270403870265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.711012095664046e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14943.0,
+      "completions/max_terminated_length": 14943.0,
+      "completions/mean_length": 6309.75,
+      "completions/mean_terminated_length": 6309.75,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "entropy": 1.012483686208725,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024940327275544405,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 173976797.0,
+      "reward": 0.4375,
+      "reward_std": 0.2790592610836029,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 0.0018720829393714666,
+      "sampling/sampling_logp_difference/max": 6.280703544616699,
+      "sampling/sampling_logp_difference/mean": 0.020797956734895706,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 1.1112337460872368e-05,
+      "clip_ratio/high_mean": 3.5388877677178243e-06,
+      "clip_ratio/low_mean": 1.7024583712554886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.056347148027271e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16362.0,
+      "completions/mean_length": 7574.984375,
+      "completions/mean_terminated_length": 7363.568359375,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9144782647490501,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002748408354818821,
+      "learning_rate": 1e-05,
+      "loss": 0.0588,
+      "num_tokens": 174965259.0,
+      "reward": 0.2734375,
+      "reward_std": 0.25224411487579346,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000108480453491,
+      "sampling/importance_sampling_ratio/min": 0.005681300535798073,
+      "sampling/sampling_logp_difference/max": 5.170575141906738,
+      "sampling/sampling_logp_difference/mean": 0.019229793921113014,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 1.4946090004741563e-05,
+      "clip_ratio/high_mean": 3.736522501185391e-06,
+      "clip_ratio/low_mean": 3.722507381098694e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.096159636901575e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6962.7734375,
+      "completions/mean_terminated_length": 6499.43408203125,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9248140156269073,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020343128126114607,
+      "learning_rate": 1e-05,
+      "loss": 0.0714,
+      "num_tokens": 175876446.0,
+      "reward": 0.421875,
+      "reward_std": 0.3156445026397705,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 0.0001609467581147328,
+      "sampling/sampling_logp_difference/max": 8.734436988830566,
+      "sampling/sampling_logp_difference/mean": 0.01860032044351101,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 4.226114015182247e-06,
+      "clip_ratio/high_mean": 1.0565285037955618e-06,
+      "clip_ratio/low_mean": 3.189400638348161e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.295053488727717e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14978.0,
+      "completions/mean_length": 6422.28125,
+      "completions/mean_terminated_length": 6264.1591796875,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 0.7786787301301956,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029119597747921944,
+      "learning_rate": 1e-05,
+      "loss": 0.1116,
+      "num_tokens": 176717226.0,
+      "reward": 0.578125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918937683105,
+      "sampling/importance_sampling_ratio/min": 0.0006287595024332404,
+      "sampling/sampling_logp_difference/max": 7.371761798858643,
+      "sampling/sampling_logp_difference/mean": 0.01786171644926071,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 5.4112551879370585e-06,
+      "clip_ratio/high_mean": 1.3528137969842646e-06,
+      "clip_ratio/low_mean": 2.103693077515345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2389744572137715e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16030.0,
+      "completions/mean_length": 6662.65625,
+      "completions/mean_terminated_length": 6508.349609375,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9501350447535515,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0027519147843122482,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 177586766.0,
+      "reward": 0.421875,
+      "reward_std": 0.21382881700992584,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000051259994507,
+      "sampling/importance_sampling_ratio/min": 2.507045428501442e-05,
+      "sampling/sampling_logp_difference/max": 10.593820571899414,
+      "sampling/sampling_logp_difference/mean": 0.020679686218500137,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 3.2487785119883483e-06,
+      "clip_ratio/high_mean": 8.121946279970871e-07,
+      "clip_ratio/low_mean": 5.783435085504607e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8646545539886574e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15419.0,
+      "completions/mean_length": 6546.171875,
+      "completions/mean_terminated_length": 6146.259765625,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "entropy": 0.9217342138290405,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017936143558472395,
+      "learning_rate": 1e-05,
+      "loss": 0.0748,
+      "num_tokens": 178444556.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000327825546265,
+      "sampling/importance_sampling_ratio/min": 8.447741129202768e-05,
+      "sampling/sampling_logp_difference/max": 9.379026412963867,
+      "sampling/sampling_logp_difference/mean": 0.019764548167586327,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 2.1980493102091714e-05,
+      "clip_ratio/high_mean": 5.4951232755229285e-06,
+      "clip_ratio/low_mean": 4.3977801396977156e-05,
+      "clip_ratio/low_min": 7.912247156127705e-06,
+      "clip_ratio/region_mean": 4.947292427459615e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15707.0,
+      "completions/max_terminated_length": 15707.0,
+      "completions/mean_length": 6433.9296875,
+      "completions/mean_terminated_length": 6433.9296875,
+      "completions/min_length": 731.0,
+      "completions/min_terminated_length": 731.0,
+      "entropy": 0.9361409991979599,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0031324021983891726,
+      "learning_rate": 1e-05,
+      "loss": 0.0505,
+      "num_tokens": 179288499.0,
+      "reward": 0.453125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999454021453857,
+      "sampling/importance_sampling_ratio/min": 0.00018488657951820642,
+      "sampling/sampling_logp_difference/max": 8.595767974853516,
+      "sampling/sampling_logp_difference/mean": 0.019691072404384613,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 1.299416817346355e-05,
+      "clip_ratio/high_mean": 3.2485420433658874e-06,
+      "clip_ratio/low_mean": 3.756406420052372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.081260635757644e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15787.0,
+      "completions/mean_length": 6037.75,
+      "completions/mean_terminated_length": 5873.52392578125,
+      "completions/min_length": 551.0,
+      "completions/min_terminated_length": 551.0,
+      "entropy": 0.8700985535979271,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0024714914616197348,
+      "learning_rate": 1e-05,
+      "loss": 0.0044,
+      "num_tokens": 180079619.0,
+      "reward": 0.484375,
+      "reward_std": 0.21436560153961182,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999628067016602,
+      "sampling/importance_sampling_ratio/min": 8.4841696661897e-05,
+      "sampling/sampling_logp_difference/max": 9.374723434448242,
+      "sampling/sampling_logp_difference/mean": 0.018519341945648193,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 7.293307589861797e-06,
+      "clip_ratio/high_mean": 1.8233268974654493e-06,
+      "clip_ratio/low_mean": 2.2305866423266707e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.412919320704532e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12264.0,
+      "completions/max_terminated_length": 12264.0,
+      "completions/mean_length": 5305.828125,
+      "completions/mean_terminated_length": 5305.828125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 1.1309608668088913,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003593914210796356,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 180780877.0,
+      "reward": 0.3984375,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011920928955,
+      "sampling/importance_sampling_ratio/min": 0.009941472671926022,
+      "sampling/sampling_logp_difference/max": 4.611040115356445,
+      "sampling/sampling_logp_difference/mean": 0.020471621304750443,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.0163415001661633e-05,
+      "clip_ratio/high_mean": 5.040853750415408e-06,
+      "clip_ratio/low_mean": 4.4980357415624894e-05,
+      "clip_ratio/low_min": 1.0012816346716136e-05,
+      "clip_ratio/region_mean": 5.0021211109196884e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13814.0,
+      "completions/mean_length": 6022.96875,
+      "completions/mean_terminated_length": 5774.30419921875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8560900762677193,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029816587921231985,
+      "learning_rate": 1e-05,
+      "loss": 0.0913,
+      "num_tokens": 181571465.0,
+      "reward": 0.515625,
+      "reward_std": 0.41504397988319397,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 1.5958334188326262e-05,
+      "sampling/sampling_logp_difference/max": 11.04552936553955,
+      "sampling/sampling_logp_difference/mean": 0.0181986466050148,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 1.8430865566188004e-05,
+      "clip_ratio/high_mean": 6.177042905619601e-06,
+      "clip_ratio/low_mean": 4.450247388376738e-05,
+      "clip_ratio/low_min": 4.840271230932558e-06,
+      "clip_ratio/region_mean": 5.067951724413433e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15130.0,
+      "completions/max_terminated_length": 15130.0,
+      "completions/mean_length": 6647.71875,
+      "completions/mean_terminated_length": 6647.71875,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "entropy": 0.9455481320619583,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0031632622703909874,
+      "learning_rate": 1e-05,
+      "loss": 0.1317,
+      "num_tokens": 182440957.0,
+      "reward": 0.3828125,
+      "reward_std": 0.39902517199516296,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000306367874146,
+      "sampling/importance_sampling_ratio/min": 1.4739508515049238e-05,
+      "sampling/sampling_logp_difference/max": 11.124979019165039,
+      "sampling/sampling_logp_difference/mean": 0.01906408555805683,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 2.2937053017813014e-05,
+      "clip_ratio/high_mean": 5.7342632544532535e-06,
+      "clip_ratio/low_mean": 6.042617155799235e-05,
+      "clip_ratio/low_min": 1.1000354334100848e-05,
+      "clip_ratio/region_mean": 6.616043401663774e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15988.0,
+      "completions/mean_length": 6809.1640625,
+      "completions/mean_terminated_length": 6500.29833984375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 1.050546184182167,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00162694591563195,
+      "learning_rate": 1e-05,
+      "loss": 0.0346,
+      "num_tokens": 183332242.0,
+      "reward": 0.421875,
+      "reward_std": 0.33616161346435547,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000290870666504,
+      "sampling/importance_sampling_ratio/min": 4.244970114086755e-06,
+      "sampling/sampling_logp_difference/max": 12.369775772094727,
+      "sampling/sampling_logp_difference/mean": 0.021866722032427788,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 1.4678411844215589e-05,
+      "clip_ratio/high_mean": 3.669602961053897e-06,
+      "clip_ratio/low_mean": 2.4373607971028832e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8043211159456405e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6815.5,
+      "completions/mean_terminated_length": 6506.83837890625,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "entropy": 1.060033954679966,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024887355975806713,
+      "learning_rate": 1e-05,
+      "loss": 0.1059,
+      "num_tokens": 184225138.0,
+      "reward": 0.328125,
+      "reward_std": 0.2869548499584198,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999393820762634,
+      "sampling/importance_sampling_ratio/min": 0.00012930770753882825,
+      "sampling/sampling_logp_difference/max": 8.953315734863281,
+      "sampling/sampling_logp_difference/mean": 0.02019432932138443,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.910891326901037e-06,
+      "clip_ratio/high_mean": 1.9777228317252593e-06,
+      "clip_ratio/low_mean": 3.8802519611635944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.078024221598753e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15838.0,
+      "completions/mean_length": 6928.4453125,
+      "completions/mean_terminated_length": 6623.42724609375,
+      "completions/min_length": 304.0,
+      "completions/min_terminated_length": 304.0,
+      "entropy": 0.9051575735211372,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002783838426694274,
+      "learning_rate": 1e-05,
+      "loss": 0.0624,
+      "num_tokens": 185136323.0,
+      "reward": 0.3359375,
+      "reward_std": 0.25460803508758545,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999524354934692,
+      "sampling/importance_sampling_ratio/min": 1.0146355634788051e-05,
+      "sampling/sampling_logp_difference/max": 11.498395919799805,
+      "sampling/sampling_logp_difference/mean": 0.01905050128698349,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 4.399394583742833e-06,
+      "clip_ratio/high_mean": 1.0998486459357082e-06,
+      "clip_ratio/low_mean": 1.733424267058581e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8434091430208355e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14703.0,
+      "completions/mean_length": 7155.1328125,
+      "completions/mean_terminated_length": 7082.46435546875,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "entropy": 1.0119014978408813,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002105508930981159,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 186071324.0,
+      "reward": 0.328125,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999904990196228,
+      "sampling/importance_sampling_ratio/min": 0.003494206117466092,
+      "sampling/sampling_logp_difference/max": 5.656649112701416,
+      "sampling/sampling_logp_difference/mean": 0.020860780030488968,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 1.0561529961705673e-05,
+      "clip_ratio/high_mean": 3.4390433256703545e-06,
+      "clip_ratio/low_mean": 2.8499469067355676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193851205196552e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16176.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 7463.2421875,
+      "completions/mean_terminated_length": 7463.2421875,
+      "completions/min_length": 698.0,
+      "completions/min_terminated_length": 698.0,
+      "entropy": 0.9983502700924873,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013582308311015368,
+      "learning_rate": 1e-05,
+      "loss": 0.048,
+      "num_tokens": 187045035.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2517249584197998,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999428987503052,
+      "sampling/importance_sampling_ratio/min": 0.000473080639494583,
+      "sampling/sampling_logp_difference/max": 7.65624475479126,
+      "sampling/sampling_logp_difference/mean": 0.021131811663508415,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 8.509013468938065e-06,
+      "clip_ratio/high_mean": 2.127253367234516e-06,
+      "clip_ratio/low_mean": 3.985050443588989e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.197775751890731e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14938.0,
+      "completions/mean_length": 6460.984375,
+      "completions/mean_terminated_length": 6382.8505859375,
+      "completions/min_length": 1747.0,
+      "completions/min_terminated_length": 1747.0,
+      "entropy": 0.7869217246770859,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002681629965081811,
+      "learning_rate": 1e-05,
+      "loss": 0.0987,
+      "num_tokens": 187889609.0,
+      "reward": 0.5234375,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999568462371826,
+      "sampling/importance_sampling_ratio/min": 0.0015037209959700704,
+      "sampling/sampling_logp_difference/max": 6.499812602996826,
+      "sampling/sampling_logp_difference/mean": 0.016937749460339546,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 1.2362176221358823e-05,
+      "clip_ratio/high_mean": 3.0905440553397057e-06,
+      "clip_ratio/low_mean": 5.0333514764133724e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.342405825103924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15893.0,
+      "completions/mean_length": 6241.78125,
+      "completions/mean_terminated_length": 6161.92138671875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.0217387825250626,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021239183843135834,
+      "learning_rate": 1e-05,
+      "loss": 0.0353,
+      "num_tokens": 188706605.0,
+      "reward": 0.2578125,
+      "reward_std": 0.3135277330875397,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999796748161316,
+      "sampling/importance_sampling_ratio/min": 0.004853047896176577,
+      "sampling/sampling_logp_difference/max": 5.328148365020752,
+      "sampling/sampling_logp_difference/mean": 0.02103862166404724,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 6.725130333506968e-06,
+      "clip_ratio/high_mean": 1.681282583376742e-06,
+      "clip_ratio/low_mean": 3.437372129155847e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.605500387493521e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15332.0,
+      "completions/mean_length": 5638.1328125,
+      "completions/mean_terminated_length": 5553.51953125,
+      "completions/min_length": 66.0,
+      "completions/min_terminated_length": 66.0,
+      "entropy": 0.7844365313649178,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023868419229984283,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 189446294.0,
+      "reward": 0.515625,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000369548797607,
+      "sampling/importance_sampling_ratio/min": 0.0008047468145377934,
+      "sampling/sampling_logp_difference/max": 7.124982833862305,
+      "sampling/sampling_logp_difference/mean": 0.017401430755853653,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 2.887730215661577e-05,
+      "clip_ratio/high_mean": 7.219325539153942e-06,
+      "clip_ratio/low_mean": 2.826443028425274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.548375502759882e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16196.0,
+      "completions/mean_length": 6374.8046875,
+      "completions/mean_terminated_length": 6215.9287109375,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9472770467400551,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027549315709620714,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 190281461.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3167053163051605,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998682737350464,
+      "sampling/importance_sampling_ratio/min": 7.100860239006579e-05,
+      "sampling/sampling_logp_difference/max": 9.552709579467773,
+      "sampling/sampling_logp_difference/mean": 0.020243138074874878,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 1.586787766427733e-05,
+      "clip_ratio/high_mean": 3.9669694160693325e-06,
+      "clip_ratio/low_mean": 2.978218674343225e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.374915604581474e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15097.0,
+      "completions/mean_length": 6654.21875,
+      "completions/mean_terminated_length": 6499.88134765625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "entropy": 1.0028243213891983,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0013344973558560014,
+      "learning_rate": 1e-05,
+      "loss": 0.0184,
+      "num_tokens": 191156249.0,
+      "reward": 0.359375,
+      "reward_std": 0.22832971811294556,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 0.0021875568199902773,
+      "sampling/sampling_logp_difference/max": 6.124969959259033,
+      "sampling/sampling_logp_difference/mean": 0.020470600575208664,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 1.681529829511419e-05,
+      "clip_ratio/high_mean": 4.9954849146160996e-06,
+      "clip_ratio/low_mean": 2.040554932136729e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5401033553862362e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16172.0,
+      "completions/mean_length": 6767.7890625,
+      "completions/mean_terminated_length": 6537.00048828125,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "entropy": 0.9059296399354935,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016136945923790336,
+      "learning_rate": 1e-05,
+      "loss": 0.0816,
+      "num_tokens": 192040526.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999668598175049,
+      "sampling/importance_sampling_ratio/min": 1.2452921509975567e-05,
+      "sampling/sampling_logp_difference/max": 11.29355525970459,
+      "sampling/sampling_logp_difference/mean": 0.020058143883943558,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9821966563758906e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9821966563758906e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16275.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 6767.4921875,
+      "completions/mean_terminated_length": 6767.4921875,
+      "completions/min_length": 998.0,
+      "completions/min_terminated_length": 998.0,
+      "entropy": 1.0446822568774223,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002869367366656661,
+      "learning_rate": 1e-05,
+      "loss": 0.0212,
+      "num_tokens": 192926469.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2517249882221222,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586343765259,
+      "sampling/importance_sampling_ratio/min": 1.9328599591972306e-05,
+      "sampling/sampling_logp_difference/max": 10.853924751281738,
+      "sampling/sampling_logp_difference/mean": 0.021512050181627274,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 3.44581130775623e-05,
+      "clip_ratio/high_mean": 1.3001711295146379e-05,
+      "clip_ratio/low_mean": 3.6407937841431703e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.940964981869911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16261.0,
+      "completions/max_terminated_length": 16261.0,
+      "completions/mean_length": 5738.484375,
+      "completions/mean_terminated_length": 5738.484375,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "entropy": 0.8617956340312958,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002177527640014887,
+      "learning_rate": 1e-05,
+      "loss": -0.0189,
+      "num_tokens": 193678859.0,
+      "reward": 0.5546875,
+      "reward_std": 0.33220988512039185,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570846557617,
+      "sampling/importance_sampling_ratio/min": 0.0008533780346624553,
+      "sampling/sampling_logp_difference/max": 7.06630802154541,
+      "sampling/sampling_logp_difference/mean": 0.018141131848096848,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 3.861003733618418e-06,
+      "clip_ratio/high_mean": 9.652509334046044e-07,
+      "clip_ratio/low_mean": 2.7767115511778684e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8732366558870126e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15595.0,
+      "completions/mean_length": 6382.90625,
+      "completions/mean_terminated_length": 5976.357421875,
+      "completions/min_length": 464.0,
+      "completions/min_terminated_length": 464.0,
+      "entropy": 0.8692388981580734,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004127771593630314,
+      "learning_rate": 1e-05,
+      "loss": 0.0572,
+      "num_tokens": 194511847.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2767002582550049,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998810291290283,
+      "sampling/importance_sampling_ratio/min": 5.4239239943854045e-06,
+      "sampling/sampling_logp_difference/max": 12.124691009521484,
+      "sampling/sampling_logp_difference/mean": 0.018376430496573448,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 9.728395525598899e-06,
+      "clip_ratio/high_mean": 2.4320988813997246e-06,
+      "clip_ratio/low_mean": 5.3631663831765763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.606376271316549e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14504.0,
+      "completions/max_terminated_length": 14504.0,
+      "completions/mean_length": 5776.15625,
+      "completions/mean_terminated_length": 5776.15625,
+      "completions/min_length": 1018.0,
+      "completions/min_terminated_length": 1018.0,
+      "entropy": 1.1195004731416702,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00263008801266551,
+      "learning_rate": 1e-05,
+      "loss": 0.0687,
+      "num_tokens": 195270051.0,
+      "reward": 0.421875,
+      "reward_std": 0.3618982434272766,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999971866607666,
+      "sampling/importance_sampling_ratio/min": 0.005209421273320913,
+      "sampling/sampling_logp_difference/max": 5.257286548614502,
+      "sampling/sampling_logp_difference/mean": 0.019923292100429535,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.2701100786216557e-05,
+      "clip_ratio/high_mean": 3.1752751965541393e-06,
+      "clip_ratio/low_mean": 4.2162768181697174e-05,
+      "clip_ratio/low_min": 3.873926743835909e-06,
+      "clip_ratio/region_mean": 4.5338043378251314e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 7411.421875,
+      "completions/mean_terminated_length": 7196.08056640625,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "entropy": 0.9801053553819656,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002642859937623143,
+      "learning_rate": 1e-05,
+      "loss": 0.07,
+      "num_tokens": 196240913.0,
+      "reward": 0.390625,
+      "reward_std": 0.27328529953956604,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999198913574219,
+      "sampling/importance_sampling_ratio/min": 0.00017500204558018595,
+      "sampling/sampling_logp_difference/max": 8.650712966918945,
+      "sampling/sampling_logp_difference/mean": 0.021511007100343704,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 1.5122936929401476e-05,
+      "clip_ratio/high_mean": 3.780734232350369e-06,
+      "clip_ratio/low_mean": 6.367217611114029e-05,
+      "clip_ratio/low_min": 4.8010447244450916e-06,
+      "clip_ratio/region_mean": 6.745291057086433e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16127.0,
+      "completions/mean_length": 7944.65625,
+      "completions/mean_terminated_length": 7742.1123046875,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "entropy": 1.0132562816143036,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002439325675368309,
+      "learning_rate": 1e-05,
+      "loss": 0.0564,
+      "num_tokens": 197278517.0,
+      "reward": 0.34375,
+      "reward_std": 0.3161812424659729,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999248385429382,
+      "sampling/importance_sampling_ratio/min": 1.0140610356756952e-05,
+      "sampling/sampling_logp_difference/max": 11.49896240234375,
+      "sampling/sampling_logp_difference/mean": 0.02124868705868721,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 2.6017536356448545e-05,
+      "clip_ratio/high_mean": 6.504384089112136e-06,
+      "clip_ratio/low_mean": 3.7791321346958284e-05,
+      "clip_ratio/low_min": 3.2110563097376144e-06,
+      "clip_ratio/region_mean": 4.429570503816649e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 7550.0,
+      "completions/mean_terminated_length": 7409.7783203125,
+      "completions/min_length": 1469.0,
+      "completions/min_terminated_length": 1469.0,
+      "entropy": 1.0384011715650558,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014879995724186301,
+      "learning_rate": 1e-05,
+      "loss": 0.0338,
+      "num_tokens": 198265589.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24040167033672333,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999468922615051,
+      "sampling/importance_sampling_ratio/min": 8.418659126618877e-05,
+      "sampling/sampling_logp_difference/max": 9.382474899291992,
+      "sampling/sampling_logp_difference/mean": 0.021503347903490067,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.3615457191917812e-05,
+      "clip_ratio/high_mean": 4.491880531531933e-06,
+      "clip_ratio/low_mean": 3.916533574965797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.365721684962409e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16221.0,
+      "completions/mean_length": 8140.9140625,
+      "completions/mean_terminated_length": 7517.48779296875,
+      "completions/min_length": 837.0,
+      "completions/min_terminated_length": 837.0,
+      "entropy": 0.8718572407960892,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002340668346732855,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 199324938.0,
+      "reward": 0.453125,
+      "reward_std": 0.35824596881866455,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999454021453857,
+      "sampling/importance_sampling_ratio/min": 0.002325017238035798,
+      "sampling/sampling_logp_difference/max": 6.064027786254883,
+      "sampling/sampling_logp_difference/mean": 0.019466478377580643,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 2.2175697040438536e-05,
+      "clip_ratio/high_mean": 5.543924260109634e-06,
+      "clip_ratio/low_mean": 4.1318608055007644e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.686253225827386e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16263.0,
+      "completions/mean_length": 6630.96875,
+      "completions/mean_terminated_length": 6396.896484375,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "entropy": 0.7798146530985832,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001989356242120266,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 200189902.0,
+      "reward": 0.5625,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474883079529,
+      "sampling/importance_sampling_ratio/min": 0.0003315774374641478,
+      "sampling/sampling_logp_difference/max": 8.011649131774902,
+      "sampling/sampling_logp_difference/mean": 0.01849902793765068,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 3.325706302348408e-06,
+      "clip_ratio/high_mean": 8.31426575587102e-07,
+      "clip_ratio/low_mean": 2.0285911205064622e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.111733795118198e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15357.0,
+      "completions/max_terminated_length": 15357.0,
+      "completions/mean_length": 6582.203125,
+      "completions/mean_terminated_length": 6582.203125,
+      "completions/min_length": 593.0,
+      "completions/min_terminated_length": 593.0,
+      "entropy": 1.0181676000356674,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002594445599243045,
+      "learning_rate": 1e-05,
+      "loss": 0.0232,
+      "num_tokens": 201052832.0,
+      "reward": 0.34375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999495148658752,
+      "sampling/importance_sampling_ratio/min": 0.0003853558446280658,
+      "sampling/sampling_logp_difference/max": 7.8613433837890625,
+      "sampling/sampling_logp_difference/mean": 0.021598614752292633,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 2.2044430352252675e-05,
+      "clip_ratio/high_mean": 5.511107588063169e-06,
+      "clip_ratio/low_mean": 3.4155824209847196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96669319115972e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14540.0,
+      "completions/max_terminated_length": 14540.0,
+      "completions/mean_length": 6145.1796875,
+      "completions/mean_terminated_length": 6145.1796875,
+      "completions/min_length": 1098.0,
+      "completions/min_terminated_length": 1098.0,
+      "entropy": 0.9084350541234016,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003104996867477894,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 201858047.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33220985531806946,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011682510376,
+      "sampling/importance_sampling_ratio/min": 0.007650630082935095,
+      "sampling/sampling_logp_difference/max": 4.87296724319458,
+      "sampling/sampling_logp_difference/mean": 0.018979094922542572,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 2.9959978519400465e-05,
+      "clip_ratio/high_mean": 7.489994629850116e-06,
+      "clip_ratio/low_mean": 3.5255963325653283e-05,
+      "clip_ratio/low_min": 2.973075879708631e-06,
+      "clip_ratio/region_mean": 4.274595892184152e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15745.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 7259.953125,
+      "completions/mean_terminated_length": 7259.953125,
+      "completions/min_length": 960.0,
+      "completions/min_terminated_length": 960.0,
+      "entropy": 0.9823614731431007,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003212577663362026,
+      "learning_rate": 1e-05,
+      "loss": 0.0133,
+      "num_tokens": 202807673.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999860405921936,
+      "sampling/importance_sampling_ratio/min": 0.000536504783667624,
+      "sampling/sampling_logp_difference/max": 7.530435085296631,
+      "sampling/sampling_logp_difference/mean": 0.021432969719171524,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 3.273996276220714e-05,
+      "clip_ratio/high_mean": 9.095591565255745e-06,
+      "clip_ratio/low_mean": 2.9539680099333054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8635271948805894e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16369.0,
+      "completions/mean_length": 7258.71875,
+      "completions/mean_terminated_length": 7113.87353515625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8823810070753098,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001418307889252901,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 203757333.0,
+      "reward": 0.40625,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884963035583,
+      "sampling/importance_sampling_ratio/min": 0.0006408974295482039,
+      "sampling/sampling_logp_difference/max": 7.3526411056518555,
+      "sampling/sampling_logp_difference/mean": 0.019296500831842422,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 1.544119368190877e-05,
+      "clip_ratio/high_mean": 3.860298420477193e-06,
+      "clip_ratio/low_mean": 3.755458698151415e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.141488631148604e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7011.40625,
+      "completions/mean_terminated_length": 6386.56689453125,
+      "completions/min_length": 685.0,
+      "completions/min_terminated_length": 685.0,
+      "entropy": 0.8057166337966919,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001652427832596004,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 204675065.0,
+      "reward": 0.46875,
+      "reward_std": 0.24146251380443573,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918937683105,
+      "sampling/importance_sampling_ratio/min": 0.015319154597818851,
+      "sampling/sampling_logp_difference/max": 4.178651332855225,
+      "sampling/sampling_logp_difference/mean": 0.018787402659654617,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 5.222041181696113e-06,
+      "clip_ratio/high_mean": 2.209917965956265e-06,
+      "clip_ratio/low_mean": 4.0701652551433654e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.291157006264257e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14796.0,
+      "completions/max_terminated_length": 14796.0,
+      "completions/mean_length": 6243.4296875,
+      "completions/mean_terminated_length": 6243.4296875,
+      "completions/min_length": 1023.0,
+      "completions/min_terminated_length": 1023.0,
+      "entropy": 0.9856048971414566,
+      "epoch": 0.23643054277828887,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001482579973526299,
+      "learning_rate": 1e-05,
+      "loss": 0.0677,
+      "num_tokens": 205494344.0,
+      "reward": 0.5390625,
+      "reward_std": 0.28930407762527466,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998942613601685,
+      "sampling/importance_sampling_ratio/min": 0.0004254466330166906,
+      "sampling/sampling_logp_difference/max": 7.762371063232422,
+      "sampling/sampling_logp_difference/mean": 0.019727632403373718,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 6.842733455414418e-05,
+      "clip_ratio/low_min": 9.297655878981459e-06,
+      "clip_ratio/region_mean": 6.842733455414418e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15485.0,
+      "completions/mean_length": 7122.2421875,
+      "completions/mean_terminated_length": 6586.4375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.8625433370471001,
+      "epoch": 0.23735050597976082,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002006452763453126,
+      "learning_rate": 1e-05,
+      "loss": 0.0312,
+      "num_tokens": 206428775.0,
+      "reward": 0.40625,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999338388442993,
+      "sampling/importance_sampling_ratio/min": 0.00010911409481195733,
+      "sampling/sampling_logp_difference/max": 9.123116493225098,
+      "sampling/sampling_logp_difference/mean": 0.01927522011101246,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 2.887607206503162e-05,
+      "clip_ratio/high_mean": 7.219018016257905e-06,
+      "clip_ratio/low_mean": 2.7790995090981596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.501001378936053e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15254.0,
+      "completions/mean_length": 7965.2734375,
+      "completions/mean_terminated_length": 7623.6826171875,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 1.0068430602550507,
+      "epoch": 0.23827046918123276,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0029176415409892797,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 207469586.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2212003916501999,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998945593833923,
+      "sampling/importance_sampling_ratio/min": 4.06005028708023e-06,
+      "sampling/sampling_logp_difference/max": 12.414315223693848,
+      "sampling/sampling_logp_difference/mean": 0.02198987640440464,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 8.710998599781306e-06,
+      "clip_ratio/high_mean": 2.1777496499453264e-06,
+      "clip_ratio/low_mean": 4.1899779091636447e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.407752874158177e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6329.4296875,
+      "completions/mean_terminated_length": 6169.83349609375,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "entropy": 0.9399363100528717,
+      "epoch": 0.23919043238270468,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0019115234026685357,
+      "learning_rate": 1e-05,
+      "loss": 0.0399,
+      "num_tokens": 208300217.0,
+      "reward": 0.4375,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000567436218262,
+      "sampling/importance_sampling_ratio/min": 2.1449603082146496e-05,
+      "sampling/sampling_logp_difference/max": 10.749804496765137,
+      "sampling/sampling_logp_difference/mean": 0.020002204924821854,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 2.536784450057894e-05,
+      "clip_ratio/high_mean": 6.341961125144735e-06,
+      "clip_ratio/low_mean": 5.959111433639919e-05,
+      "clip_ratio/low_min": 1.1521060741870315e-05,
+      "clip_ratio/region_mean": 6.593307591629127e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15635.0,
+      "completions/mean_length": 6747.90625,
+      "completions/mean_terminated_length": 6594.95263671875,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "entropy": 0.9575144425034523,
+      "epoch": 0.24011039558417663,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003766207257285714,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 209181077.0,
+      "reward": 0.4375,
+      "reward_std": 0.3164137303829193,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999313354492188,
+      "sampling/importance_sampling_ratio/min": 1.250743298442103e-05,
+      "sampling/sampling_logp_difference/max": 11.28918743133545,
+      "sampling/sampling_logp_difference/mean": 0.020067427307367325,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 2.0626074274332495e-05,
+      "clip_ratio/high_mean": 5.156518568583124e-06,
+      "clip_ratio/low_mean": 5.808068385704246e-05,
+      "clip_ratio/low_min": 1.0360539818066172e-05,
+      "clip_ratio/region_mean": 6.32372018571914e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 6426.6953125,
+      "completions/mean_terminated_length": 6348.29150390625,
+      "completions/min_length": 767.0,
+      "completions/min_terminated_length": 767.0,
+      "entropy": 0.87480478733778,
+      "epoch": 0.24103035878564857,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002375675830990076,
+      "learning_rate": 1e-05,
+      "loss": 0.0752,
+      "num_tokens": 210023702.0,
+      "reward": 0.5078125,
+      "reward_std": 0.38900789618492126,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999383687973022,
+      "sampling/importance_sampling_ratio/min": 0.00024259372730739415,
+      "sampling/sampling_logp_difference/max": 8.324122428894043,
+      "sampling/sampling_logp_difference/mean": 0.018864646553993225,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 4.462851393327583e-06,
+      "clip_ratio/high_mean": 1.1157128483318957e-06,
+      "clip_ratio/low_mean": 3.8966268334661436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.008198141036701e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16022.0,
+      "completions/mean_length": 7223.1484375,
+      "completions/mean_terminated_length": 6927.63671875,
+      "completions/min_length": 1015.0,
+      "completions/min_terminated_length": 1015.0,
+      "entropy": 1.0218688547611237,
+      "epoch": 0.24195032198712052,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016257674433290958,
+      "learning_rate": 1e-05,
+      "loss": 0.0791,
+      "num_tokens": 210969921.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2896084189414978,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999514818191528,
+      "sampling/importance_sampling_ratio/min": 9.193710138788447e-05,
+      "sampling/sampling_logp_difference/max": 9.294405937194824,
+      "sampling/sampling_logp_difference/mean": 0.02119653858244419,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 1.2653464409595472e-05,
+      "clip_ratio/high_mean": 3.163366102398868e-06,
+      "clip_ratio/low_mean": 4.864477250521304e-05,
+      "clip_ratio/low_min": 8.641252861707471e-06,
+      "clip_ratio/region_mean": 5.1808138323394815e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15180.0,
+      "completions/max_terminated_length": 15180.0,
+      "completions/mean_length": 6974.0703125,
+      "completions/mean_terminated_length": 6974.0703125,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9265539348125458,
+      "epoch": 0.24287028518859247,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023448490537703037,
+      "learning_rate": 1e-05,
+      "loss": 0.0567,
+      "num_tokens": 211884866.0,
+      "reward": 0.390625,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000474452972412,
+      "sampling/importance_sampling_ratio/min": 0.0007677432149648666,
+      "sampling/sampling_logp_difference/max": 7.172055244445801,
+      "sampling/sampling_logp_difference/mean": 0.020384611561894417,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 1.1967917316724197e-05,
+      "clip_ratio/high_mean": 2.9919793291810493e-06,
+      "clip_ratio/low_mean": 3.179497366545547e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.478695157355105e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15657.0,
+      "completions/mean_length": 7247.2734375,
+      "completions/mean_terminated_length": 7027.9921875,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.9756898358464241,
+      "epoch": 0.24379024839006438,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003212807234376669,
+      "learning_rate": 1e-05,
+      "loss": 0.0484,
+      "num_tokens": 212833933.0,
+      "reward": 0.328125,
+      "reward_std": 0.2398776412010193,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999449253082275,
+      "sampling/importance_sampling_ratio/min": 0.001600456889718771,
+      "sampling/sampling_logp_difference/max": 6.437466144561768,
+      "sampling/sampling_logp_difference/mean": 0.0199666079133749,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 1.1404694760130951e-05,
+      "clip_ratio/high_mean": 3.887520392709121e-06,
+      "clip_ratio/low_mean": 4.0242122167910566e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4129643583801226e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15575.0,
+      "completions/mean_length": 7773.9296875,
+      "completions/mean_terminated_length": 7423.9267578125,
+      "completions/min_length": 568.0,
+      "completions/min_terminated_length": 568.0,
+      "entropy": 0.9765531942248344,
+      "epoch": 0.24471021159153633,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019600428640842438,
+      "learning_rate": 1e-05,
+      "loss": 0.0357,
+      "num_tokens": 213848508.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3129909336566925,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 2.430168751743622e-05,
+      "sampling/sampling_logp_difference/max": 10.624964714050293,
+      "sampling/sampling_logp_difference/mean": 0.020565161481499672,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 6.725708999510971e-06,
+      "clip_ratio/high_mean": 1.6814272498777427e-06,
+      "clip_ratio/low_mean": 2.869901106805628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0380438261090603e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15583.0,
+      "completions/mean_length": 6722.5,
+      "completions/mean_terminated_length": 6569.14306640625,
+      "completions/min_length": 1021.0,
+      "completions/min_terminated_length": 1021.0,
+      "entropy": 0.9291529878973961,
+      "epoch": 0.24563017479300828,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014550165506079793,
+      "learning_rate": 1e-05,
+      "loss": 0.0235,
+      "num_tokens": 214731180.0,
+      "reward": 0.4921875,
+      "reward_std": 0.19332444667816162,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999178647994995,
+      "sampling/importance_sampling_ratio/min": 0.007400285452604294,
+      "sampling/sampling_logp_difference/max": 4.90623664855957,
+      "sampling/sampling_logp_difference/mean": 0.020057080313563347,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 1.8797170469042612e-05,
+      "clip_ratio/high_mean": 6.827749643889547e-06,
+      "clip_ratio/low_mean": 3.448591337473772e-05,
+      "clip_ratio/low_min": 4.687090040533803e-06,
+      "clip_ratio/region_mean": 4.1313662677566754e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15797.0,
+      "completions/max_terminated_length": 15797.0,
+      "completions/mean_length": 7001.8671875,
+      "completions/mean_terminated_length": 7001.8671875,
+      "completions/min_length": 930.0,
+      "completions/min_terminated_length": 930.0,
+      "entropy": 1.0746883526444435,
+      "epoch": 0.24655013799448022,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002483292715623975,
+      "learning_rate": 1e-05,
+      "loss": 0.048,
+      "num_tokens": 215645819.0,
+      "reward": 0.3515625,
+      "reward_std": 0.32955142855644226,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 1.0195622053288389e-05,
+      "sampling/sampling_logp_difference/max": 11.493552207946777,
+      "sampling/sampling_logp_difference/mean": 0.020808640867471695,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 8.748068921704544e-06,
+      "clip_ratio/high_mean": 2.187017230426136e-06,
+      "clip_ratio/low_mean": 8.762007928453386e-05,
+      "clip_ratio/low_min": 2.3698836685071e-05,
+      "clip_ratio/region_mean": 8.980709480965743e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14189.0,
+      "completions/mean_length": 6663.796875,
+      "completions/mean_terminated_length": 6509.50830078125,
+      "completions/min_length": 1148.0,
+      "completions/min_terminated_length": 1148.0,
+      "entropy": 1.0000900849699974,
+      "epoch": 0.24747010119595217,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0015696679474785924,
+      "learning_rate": 1e-05,
+      "loss": 0.0731,
+      "num_tokens": 216519369.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3214311897754669,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997707605361938,
+      "sampling/importance_sampling_ratio/min": 1.288027192458685e-06,
+      "sampling/sampling_logp_difference/max": 13.562398910522461,
+      "sampling/sampling_logp_difference/mean": 0.022182684391736984,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.567897690321843e-05,
+      "clip_ratio/low_min": 3.287224444648018e-06,
+      "clip_ratio/region_mean": 4.567897690321843e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16381.0,
+      "completions/mean_length": 6978.7421875,
+      "completions/mean_terminated_length": 6829.45263671875,
+      "completions/min_length": 1661.0,
+      "completions/min_terminated_length": 1661.0,
+      "entropy": 1.0845019966363907,
+      "epoch": 0.24839006439742412,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003887100610882044,
+      "learning_rate": 1e-05,
+      "loss": 0.1076,
+      "num_tokens": 217432432.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3124619722366333,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999902248382568,
+      "sampling/importance_sampling_ratio/min": 0.02168075367808342,
+      "sampling/sampling_logp_difference/max": 3.8313302993774414,
+      "sampling/sampling_logp_difference/mean": 0.02127157337963581,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 2.444328310957644e-05,
+      "clip_ratio/high_mean": 6.11082077739411e-06,
+      "clip_ratio/low_mean": 5.1527222922231886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.7638043699625996e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15638.0,
+      "completions/mean_length": 5903.5546875,
+      "completions/mean_terminated_length": 5652.0244140625,
+      "completions/min_length": 651.0,
+      "completions/min_terminated_length": 651.0,
+      "entropy": 0.8638224303722382,
+      "epoch": 0.24931002759889603,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002851828932762146,
+      "learning_rate": 1e-05,
+      "loss": 0.0771,
+      "num_tokens": 218208399.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3713914752006531,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000553131103516,
+      "sampling/importance_sampling_ratio/min": 0.000626727007329464,
+      "sampling/sampling_logp_difference/max": 7.374999523162842,
+      "sampling/sampling_logp_difference/mean": 0.01880766451358795,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 8.474872856822913e-06,
+      "clip_ratio/high_mean": 2.118718214205728e-06,
+      "clip_ratio/low_mean": 2.5821682072546537e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.794040096887329e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16060.0,
+      "completions/max_terminated_length": 16060.0,
+      "completions/mean_length": 5596.7109375,
+      "completions/mean_terminated_length": 5596.7109375,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "entropy": 1.1127397641539574,
+      "epoch": 0.250229990800368,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018005800666287541,
+      "learning_rate": 1e-05,
+      "loss": 0.0075,
+      "num_tokens": 218944418.0,
+      "reward": 0.4375,
+      "reward_std": 0.29485049843788147,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000048875808716,
+      "sampling/importance_sampling_ratio/min": 0.01548748929053545,
+      "sampling/sampling_logp_difference/max": 4.167722702026367,
+      "sampling/sampling_logp_difference/mean": 0.02004322223365307,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 1.5034628631838132e-05,
+      "clip_ratio/high_mean": 4.925485768580984e-06,
+      "clip_ratio/low_mean": 3.539464648838475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.032013237065257e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16362.0,
+      "completions/mean_length": 7120.109375,
+      "completions/mean_terminated_length": 7047.16552734375,
+      "completions/min_length": 816.0,
+      "completions/min_terminated_length": 816.0,
+      "entropy": 1.0697019025683403,
+      "epoch": 0.2511499540018399,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022711476776748896,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 219875952.0,
+      "reward": 0.2734375,
+      "reward_std": 0.23751862347126007,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000036358833313,
+      "sampling/importance_sampling_ratio/min": 9.733050683280453e-05,
+      "sampling/sampling_logp_difference/max": 9.237398147583008,
+      "sampling/sampling_logp_difference/mean": 0.02110595628619194,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 1.0558468147792155e-05,
+      "clip_ratio/high_mean": 2.6396170369480387e-06,
+      "clip_ratio/low_mean": 3.796903268948881e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.060864915800266e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 7623.953125,
+      "completions/mean_terminated_length": 7484.9052734375,
+      "completions/min_length": 372.0,
+      "completions/min_terminated_length": 372.0,
+      "entropy": 0.8836525157094002,
+      "epoch": 0.25206991720331184,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002609838731586933,
+      "learning_rate": 1e-05,
+      "loss": 0.0563,
+      "num_tokens": 220871730.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999877214431763,
+      "sampling/importance_sampling_ratio/min": 0.0015448236372321844,
+      "sampling/sampling_logp_difference/max": 6.472845554351807,
+      "sampling/sampling_logp_difference/mean": 0.019322458654642105,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 1.144785210271948e-05,
+      "clip_ratio/high_mean": 2.86196302567987e-06,
+      "clip_ratio/low_mean": 5.795533934360719e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 6.081730361984228e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15557.0,
+      "completions/mean_length": 6778.71875,
+      "completions/mean_terminated_length": 6703.08642578125,
+      "completions/min_length": 1187.0,
+      "completions/min_terminated_length": 1187.0,
+      "entropy": 0.8968989998102188,
+      "epoch": 0.2529898804047838,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.00395589042454958,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 221761214.0,
+      "reward": 0.4921875,
+      "reward_std": 0.4032142758369446,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000214576721191,
+      "sampling/importance_sampling_ratio/min": 0.0011724763317033648,
+      "sampling/sampling_logp_difference/max": 6.7486371994018555,
+      "sampling/sampling_logp_difference/mean": 0.018937086686491966,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 2.708495139813749e-05,
+      "clip_ratio/high_mean": 7.628764933542698e-06,
+      "clip_ratio/low_mean": 3.0297362627607072e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.792612744746293e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 7319.2578125,
+      "completions/mean_terminated_length": 6794.85107421875,
+      "completions/min_length": 1034.0,
+      "completions/min_terminated_length": 1034.0,
+      "entropy": 0.870811752974987,
+      "epoch": 0.25390984360625574,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002063714899122715,
+      "learning_rate": 1e-05,
+      "loss": 0.0271,
+      "num_tokens": 222719287.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2835301160812378,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999525547027588,
+      "sampling/importance_sampling_ratio/min": 2.13631665246794e-05,
+      "sampling/sampling_logp_difference/max": 10.7538423538208,
+      "sampling/sampling_logp_difference/mean": 0.019336167722940445,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 3.860288416035473e-06,
+      "clip_ratio/high_mean": 9.650721040088683e-07,
+      "clip_ratio/low_mean": 2.303871349340625e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4003785597415117e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16326.0,
+      "completions/mean_length": 6207.4140625,
+      "completions/mean_terminated_length": 5879.13671875,
+      "completions/min_length": 752.0,
+      "completions/min_terminated_length": 752.0,
+      "entropy": 0.8348869979381561,
+      "epoch": 0.2548298068077277,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023463829420506954,
+      "learning_rate": 1e-05,
+      "loss": 0.0696,
+      "num_tokens": 223533372.0,
+      "reward": 0.4375,
+      "reward_std": 0.2359210103750229,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000433921813965,
+      "sampling/importance_sampling_ratio/min": 2.1447433027788065e-05,
+      "sampling/sampling_logp_difference/max": 10.749905586242676,
+      "sampling/sampling_logp_difference/mean": 0.018392907455563545,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 2.1441665467136772e-05,
+      "clip_ratio/high_mean": 5.360416366784193e-06,
+      "clip_ratio/low_mean": 5.504566888703266e-05,
+      "clip_ratio/low_min": 1.2581466762640048e-05,
+      "clip_ratio/region_mean": 6.040608514013002e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14713.0,
+      "completions/max_terminated_length": 14713.0,
+      "completions/mean_length": 6417.2109375,
+      "completions/mean_terminated_length": 6417.2109375,
+      "completions/min_length": 981.0,
+      "completions/min_terminated_length": 981.0,
+      "entropy": 1.0232173576951027,
+      "epoch": 0.25574977000919963,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033652919810265303,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 224375711.0,
+      "reward": 0.390625,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999820590019226,
+      "sampling/importance_sampling_ratio/min": 0.0020559614058583975,
+      "sampling/sampling_logp_difference/max": 6.18701171875,
+      "sampling/sampling_logp_difference/mean": 0.020980924367904663,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 4.679544872487895e-06,
+      "clip_ratio/high_mean": 1.1698862181219738e-06,
+      "clip_ratio/low_mean": 2.818696702888701e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9356853247008985e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15026.0,
+      "completions/max_terminated_length": 15026.0,
+      "completions/mean_length": 5275.9453125,
+      "completions/mean_terminated_length": 5275.9453125,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "entropy": 0.8563915193080902,
+      "epoch": 0.25666973321067155,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025473968125879765,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 225070992.0,
+      "reward": 0.703125,
+      "reward_std": 0.2790592610836029,
+      "rewards/accuracy_reward/mean": 0.703125,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999873042106628,
+      "sampling/importance_sampling_ratio/min": 0.0010016229934990406,
+      "sampling/sampling_logp_difference/max": 6.906133651733398,
+      "sampling/sampling_logp_difference/mean": 0.018068701028823853,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.1973035422415705e-05,
+      "clip_ratio/low_min": 6.267234766710317e-06,
+      "clip_ratio/region_mean": 4.1973035422415705e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16350.0,
+      "completions/mean_length": 7693.984375,
+      "completions/mean_terminated_length": 7556.0478515625,
+      "completions/min_length": 1349.0,
+      "completions/min_terminated_length": 1349.0,
+      "entropy": 0.7832933664321899,
+      "epoch": 0.2575896964121435,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016663498245179653,
+      "learning_rate": 1e-05,
+      "loss": 0.0836,
+      "num_tokens": 226073822.0,
+      "reward": 0.421875,
+      "reward_std": 0.3227166533470154,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999273419380188,
+      "sampling/importance_sampling_ratio/min": 5.893720299354754e-06,
+      "sampling/sampling_logp_difference/max": 12.04162311553955,
+      "sampling/sampling_logp_difference/mean": 0.01851016655564308,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 1.304801662627142e-05,
+      "clip_ratio/high_mean": 3.262004156567855e-06,
+      "clip_ratio/low_mean": 3.7096169648975774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.035817426029098e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15207.0,
+      "completions/mean_length": 6222.4609375,
+      "completions/mean_terminated_length": 6061.1669921875,
+      "completions/min_length": 967.0,
+      "completions/min_terminated_length": 967.0,
+      "entropy": 0.8835120126605034,
+      "epoch": 0.25850965961361544,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021045261528342962,
+      "learning_rate": 1e-05,
+      "loss": 0.055,
+      "num_tokens": 226888577.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999616742134094,
+      "sampling/importance_sampling_ratio/min": 5.688065698450373e-07,
+      "sampling/sampling_logp_difference/max": 14.379725456237793,
+      "sampling/sampling_logp_difference/mean": 0.018851105123758316,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.1754828114571865e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1754828114571865e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16331.0,
+      "completions/mean_length": 6371.625,
+      "completions/mean_terminated_length": 6131.328125,
+      "completions/min_length": 1034.0,
+      "completions/min_terminated_length": 1034.0,
+      "entropy": 0.9026313945651054,
+      "epoch": 0.2594296228150874,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030448357574641705,
+      "learning_rate": 1e-05,
+      "loss": 0.1009,
+      "num_tokens": 227722025.0,
+      "reward": 0.515625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999712705612183,
+      "sampling/importance_sampling_ratio/min": 0.00016869648243300617,
+      "sampling/sampling_logp_difference/max": 8.687409400939941,
+      "sampling/sampling_logp_difference/mean": 0.018757576122879982,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 7.024085562079563e-06,
+      "clip_ratio/high_mean": 1.7560213905198907e-06,
+      "clip_ratio/low_mean": 3.379111592494155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5547137599678535e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15891.0,
+      "completions/mean_length": 7510.4921875,
+      "completions/mean_terminated_length": 7224.25,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 1.044313833117485,
+      "epoch": 0.26034958601655933,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019295766251161695,
+      "learning_rate": 1e-05,
+      "loss": 0.0513,
+      "num_tokens": 228703256.0,
+      "reward": 0.3046875,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999972581863403,
+      "sampling/importance_sampling_ratio/min": 0.0002186153142247349,
+      "sampling/sampling_logp_difference/max": 8.428196907043457,
+      "sampling/sampling_logp_difference/mean": 0.02207346074283123,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 5.068321115686558e-06,
+      "clip_ratio/high_mean": 1.2670802789216395e-06,
+      "clip_ratio/low_mean": 3.7797102550030104e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9064182828951743e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 7594.140625,
+      "completions/mean_terminated_length": 7524.92919921875,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9706612005829811,
+      "epoch": 0.2612695492180313,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0017117204843088984,
+      "learning_rate": 1e-05,
+      "loss": 0.0748,
+      "num_tokens": 229697002.0,
+      "reward": 0.2734375,
+      "reward_std": 0.18649455904960632,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000016212463379,
+      "sampling/importance_sampling_ratio/min": 0.00035400164779275656,
+      "sampling/sampling_logp_difference/max": 7.946208953857422,
+      "sampling/sampling_logp_difference/mean": 0.021097885444760323,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 1.5618601537426002e-05,
+      "clip_ratio/high_mean": 3.904650384356501e-06,
+      "clip_ratio/low_mean": 4.570582996166195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.961048034601845e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15562.0,
+      "completions/mean_length": 6888.9140625,
+      "completions/mean_terminated_length": 6738.19873046875,
+      "completions/min_length": 327.0,
+      "completions/min_terminated_length": 327.0,
+      "entropy": 0.9210037142038345,
+      "epoch": 0.2621895124195032,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025933689903467894,
+      "learning_rate": 1e-05,
+      "loss": 0.0887,
+      "num_tokens": 230598679.0,
+      "reward": 0.4375,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.0007308972999453545,
+      "sampling/sampling_logp_difference/max": 7.221237659454346,
+      "sampling/sampling_logp_difference/mean": 0.01939917542040348,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 2.398964193162101e-05,
+      "clip_ratio/high_mean": 6.9283565835576155e-06,
+      "clip_ratio/low_mean": 4.821338916372042e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.514174608833855e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15458.0,
+      "completions/mean_length": 6433.640625,
+      "completions/mean_terminated_length": 6355.29150390625,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "entropy": 1.064419962465763,
+      "epoch": 0.26310947562097514,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0019397985888645053,
+      "learning_rate": 1e-05,
+      "loss": 0.0841,
+      "num_tokens": 231440153.0,
+      "reward": 0.375,
+      "reward_std": 0.3451131582260132,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999503493309021,
+      "sampling/importance_sampling_ratio/min": 0.019039930775761604,
+      "sampling/sampling_logp_difference/max": 3.961216926574707,
+      "sampling/sampling_logp_difference/mean": 0.021084938198328018,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 1.9223051822336856e-05,
+      "clip_ratio/high_mean": 6.997284344834043e-06,
+      "clip_ratio/low_mean": 5.4512621773028513e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.150990611786256e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14304.0,
+      "completions/mean_length": 5858.40625,
+      "completions/mean_terminated_length": 5691.33349609375,
+      "completions/min_length": 546.0,
+      "completions/min_terminated_length": 546.0,
+      "entropy": 0.8120778575539589,
+      "epoch": 0.2640294388224471,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002288782736286521,
+      "learning_rate": 1e-05,
+      "loss": 0.0408,
+      "num_tokens": 232209485.0,
+      "reward": 0.46875,
+      "reward_std": 0.36637401580810547,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999239444732666,
+      "sampling/importance_sampling_ratio/min": 0.00017959839897230268,
+      "sampling/sampling_logp_difference/max": 8.624787330627441,
+      "sampling/sampling_logp_difference/mean": 0.019076552242040634,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 9.900939403451048e-06,
+      "clip_ratio/high_mean": 3.4680233511608094e-06,
+      "clip_ratio/low_mean": 1.8137742017643177e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1605765368803986e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 7088.4765625,
+      "completions/mean_terminated_length": 6710.609375,
+      "completions/min_length": 688.0,
+      "completions/min_terminated_length": 688.0,
+      "entropy": 0.9231890514492989,
+      "epoch": 0.26494940202391903,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.001075367210432887,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 233133850.0,
+      "reward": 0.5078125,
+      "reward_std": 0.18383610248565674,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998681545257568,
+      "sampling/importance_sampling_ratio/min": 0.005257915705442429,
+      "sampling/sampling_logp_difference/max": 5.248020648956299,
+      "sampling/sampling_logp_difference/mean": 0.019140273332595825,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 8.648456969240215e-06,
+      "clip_ratio/high_mean": 2.1621142423100537e-06,
+      "clip_ratio/low_mean": 1.838804723774956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0550161480059614e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16384.0,
+      "completions/mean_length": 6151.78125,
+      "completions/mean_terminated_length": 5906.20849609375,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.8585417941212654,
+      "epoch": 0.265869365225391,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0015517349820584059,
+      "learning_rate": 1e-05,
+      "loss": 0.0828,
+      "num_tokens": 233940718.0,
+      "reward": 0.46875,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000255107879639,
+      "sampling/importance_sampling_ratio/min": 7.617311348440126e-05,
+      "sampling/sampling_logp_difference/max": 9.482501983642578,
+      "sampling/sampling_logp_difference/mean": 0.019276250153779984,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 1.1416668485253467e-05,
+      "clip_ratio/high_mean": 3.7661499732166703e-06,
+      "clip_ratio/low_mean": 2.1342358195397537e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5108507770710276e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15133.0,
+      "completions/mean_length": 7111.2578125,
+      "completions/mean_terminated_length": 6812.13671875,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "entropy": 0.9735362678766251,
+      "epoch": 0.2667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0036829947493970394,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 234872111.0,
+      "reward": 0.4296875,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999943971633911,
+      "sampling/importance_sampling_ratio/min": 0.0006535807042382658,
+      "sampling/sampling_logp_difference/max": 7.333044528961182,
+      "sampling/sampling_logp_difference/mean": 0.021356046199798584,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 2.2526005068357335e-05,
+      "clip_ratio/high_mean": 5.631501267089334e-06,
+      "clip_ratio/low_mean": 3.30086276107977e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.864012808207917e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15995.0,
+      "completions/mean_length": 6787.671875,
+      "completions/mean_terminated_length": 6478.11279296875,
+      "completions/min_length": 1404.0,
+      "completions/min_terminated_length": 1404.0,
+      "entropy": 0.8856986835598946,
+      "epoch": 0.26770929162833484,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00234629912301898,
+      "learning_rate": 1e-05,
+      "loss": 0.0169,
+      "num_tokens": 235759149.0,
+      "reward": 0.5390625,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999296069145203,
+      "sampling/importance_sampling_ratio/min": 0.00035710117663256824,
+      "sampling/sampling_logp_difference/max": 7.937491416931152,
+      "sampling/sampling_logp_difference/mean": 0.01950475014746189,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 2.6025282068076194e-05,
+      "clip_ratio/high_mean": 6.5063205170190486e-06,
+      "clip_ratio/low_mean": 4.603358706845029e-05,
+      "clip_ratio/low_min": 4.53654638477019e-06,
+      "clip_ratio/region_mean": 5.253990843812062e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15845.0,
+      "completions/mean_length": 6757.203125,
+      "completions/mean_terminated_length": 6604.39697265625,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "entropy": 0.9217840805649757,
+      "epoch": 0.2686292548298068,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034125701058655977,
+      "learning_rate": 1e-05,
+      "loss": 0.0527,
+      "num_tokens": 236643319.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2896084189414978,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 6.144329745438881e-06,
+      "sampling/sampling_logp_difference/max": 11.999980926513672,
+      "sampling/sampling_logp_difference/mean": 0.020774487406015396,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.5210429246035346e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5210429246035346e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 6504.4375,
+      "completions/mean_terminated_length": 6185.74169921875,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "entropy": 1.126970261335373,
+      "epoch": 0.26954921803127874,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020905097480863333,
+      "learning_rate": 1e-05,
+      "loss": 0.0464,
+      "num_tokens": 237495351.0,
+      "reward": 0.25,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000053644180298,
+      "sampling/importance_sampling_ratio/min": 0.0009940610034391284,
+      "sampling/sampling_logp_difference/max": 6.913712024688721,
+      "sampling/sampling_logp_difference/mean": 0.023218728601932526,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.5693222053414502e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.5693222053414502e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15888.0,
+      "completions/mean_length": 5702.4140625,
+      "completions/mean_terminated_length": 5446.05615234375,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8803137242794037,
+      "epoch": 0.2704691812327507,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002073790645226836,
+      "learning_rate": 1e-05,
+      "loss": 0.0066,
+      "num_tokens": 238251852.0,
+      "reward": 0.5625,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000054955482483,
+      "sampling/importance_sampling_ratio/min": 0.016290459781885147,
+      "sampling/sampling_logp_difference/max": 4.117175579071045,
+      "sampling/sampling_logp_difference/mean": 0.0185186006128788,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 1.4213665508577833e-05,
+      "clip_ratio/high_mean": 4.4483959982244414e-06,
+      "clip_ratio/low_mean": 2.979715202400257e-05,
+      "clip_ratio/low_min": 4.1597336348786484e-06,
+      "clip_ratio/region_mean": 3.424554824960069e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 7176.2890625,
+      "completions/mean_terminated_length": 6801.99169921875,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 0.9554997384548187,
+      "epoch": 0.27138914443422263,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002781527815386653,
+      "learning_rate": 1e-05,
+      "loss": 0.0908,
+      "num_tokens": 239189385.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3634958863258362,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999794960021973,
+      "sampling/importance_sampling_ratio/min": 0.0018711343873292208,
+      "sampling/sampling_logp_difference/max": 6.281210422515869,
+      "sampling/sampling_logp_difference/mean": 0.020436719059944153,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 1.2612186310434481e-05,
+      "clip_ratio/high_mean": 5.171368570699997e-06,
+      "clip_ratio/low_mean": 4.8968343890010146e-05,
+      "clip_ratio/low_min": 4.0222671486844774e-06,
+      "clip_ratio/region_mean": 5.413971166490228e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16020.0,
+      "completions/mean_length": 7855.578125,
+      "completions/mean_terminated_length": 7651.2001953125,
+      "completions/min_length": 688.0,
+      "completions/min_terminated_length": 688.0,
+      "entropy": 0.9450526610016823,
+      "epoch": 0.27230910763569455,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003132987068966031,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 240217715.0,
+      "reward": 0.40625,
+      "reward_std": 0.28512775897979736,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999253153800964,
+      "sampling/importance_sampling_ratio/min": 0.0011438478250056505,
+      "sampling/sampling_logp_difference/max": 6.773357391357422,
+      "sampling/sampling_logp_difference/mean": 0.021461743861436844,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 2.172341964978841e-05,
+      "clip_ratio/high_mean": 6.823271291978017e-06,
+      "clip_ratio/low_mean": 3.516899266742257e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.199226441414794e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14060.0,
+      "completions/mean_length": 6240.265625,
+      "completions/mean_terminated_length": 5913.04833984375,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.8811023011803627,
+      "epoch": 0.2732290708371665,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028523094952106476,
+      "learning_rate": 1e-05,
+      "loss": 0.015,
+      "num_tokens": 241035133.0,
+      "reward": 0.484375,
+      "reward_std": 0.26143303513526917,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000007152557373,
+      "sampling/importance_sampling_ratio/min": 0.0006931954412721097,
+      "sampling/sampling_logp_difference/max": 7.274198532104492,
+      "sampling/sampling_logp_difference/mean": 0.019493088126182556,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 1.2606601558218244e-05,
+      "clip_ratio/high_mean": 3.151650389554561e-06,
+      "clip_ratio/low_mean": 3.768150395444536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.08331545713736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15795.0,
+      "completions/mean_length": 6103.203125,
+      "completions/mean_terminated_length": 6022.251953125,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.8766692876815796,
+      "epoch": 0.27414903403863844,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026241440791636705,
+      "learning_rate": 1e-05,
+      "loss": 0.0089,
+      "num_tokens": 241836479.0,
+      "reward": 0.453125,
+      "reward_std": 0.32589423656463623,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925434589386,
+      "sampling/importance_sampling_ratio/min": 0.00012664205860346556,
+      "sampling/sampling_logp_difference/max": 8.974145889282227,
+      "sampling/sampling_logp_difference/mean": 0.01907728984951973,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 1.7400974911652156e-05,
+      "clip_ratio/high_mean": 4.350243727913039e-06,
+      "clip_ratio/low_mean": 4.527119426711579e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962143839293276e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 7711.0703125,
+      "completions/mean_terminated_length": 7573.4052734375,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 1.0770929008722305,
+      "epoch": 0.2750689972401104,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003654222236946225,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 242844376.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999839067459106,
+      "sampling/importance_sampling_ratio/min": 0.0006267472635954618,
+      "sampling/sampling_logp_difference/max": 7.374967098236084,
+      "sampling/sampling_logp_difference/mean": 0.022012868896126747,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 1.4325163647299632e-05,
+      "clip_ratio/high_mean": 3.581290911824908e-06,
+      "clip_ratio/low_mean": 4.28195745598714e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6400865016948956e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15905.0,
+      "completions/mean_length": 6616.5546875,
+      "completions/mean_terminated_length": 6539.6455078125,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "entropy": 0.8439916148781776,
+      "epoch": 0.27598896044158233,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029195898678153753,
+      "learning_rate": 1e-05,
+      "loss": 0.1094,
+      "num_tokens": 243708479.0,
+      "reward": 0.453125,
+      "reward_std": 0.3516485095024109,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 2.189194128732197e-05,
+      "sampling/sampling_logp_difference/max": 10.729392051696777,
+      "sampling/sampling_logp_difference/mean": 0.017992788925766945,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 1.848296233220026e-05,
+      "clip_ratio/high_mean": 4.620740583050065e-06,
+      "clip_ratio/low_mean": 5.01860952226707e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.480683557834709e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15286.0,
+      "completions/mean_length": 6173.5234375,
+      "completions/mean_terminated_length": 6093.1259765625,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "entropy": 0.8975192531943321,
+      "epoch": 0.2769089236430543,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0017261393368244171,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 244515378.0,
+      "reward": 0.53125,
+      "reward_std": 0.3532412052154541,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999533891677856,
+      "sampling/importance_sampling_ratio/min": 0.000553854217287153,
+      "sampling/sampling_logp_difference/max": 7.4986090660095215,
+      "sampling/sampling_logp_difference/mean": 0.019458644092082977,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 4.114005332667148e-05,
+      "clip_ratio/high_mean": 1.2276760230633954e-05,
+      "clip_ratio/low_mean": 3.397437080820964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.625113024303573e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16032.0,
+      "completions/mean_length": 5640.90625,
+      "completions/mean_terminated_length": 5470.38134765625,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "entropy": 0.8833519890904427,
+      "epoch": 0.2778288868445262,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018768958980217576,
+      "learning_rate": 1e-05,
+      "loss": 0.0731,
+      "num_tokens": 245258318.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3135277330875397,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999516606330872,
+      "sampling/importance_sampling_ratio/min": 0.0071789538487792015,
+      "sampling/sampling_logp_difference/max": 4.936601638793945,
+      "sampling/sampling_logp_difference/mean": 0.019646335393190384,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 1.4196921938491869e-05,
+      "clip_ratio/high_mean": 4.514302474944998e-06,
+      "clip_ratio/low_mean": 4.4677519781544106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.919182129015098e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16378.0,
+      "completions/mean_length": 7840.5078125,
+      "completions/mean_terminated_length": 7564.9111328125,
+      "completions/min_length": 758.0,
+      "completions/min_terminated_length": 758.0,
+      "entropy": 0.9772802665829659,
+      "epoch": 0.27874885004599814,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002617602702230215,
+      "learning_rate": 1e-05,
+      "loss": 0.0298,
+      "num_tokens": 246280663.0,
+      "reward": 0.328125,
+      "reward_std": 0.29826050996780396,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0008982301224023104,
+      "sampling/sampling_logp_difference/max": 7.015084266662598,
+      "sampling/sampling_logp_difference/mean": 0.022171074524521828,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7621316146687604e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7621316146687604e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16326.0,
+      "completions/mean_length": 6316.1015625,
+      "completions/mean_terminated_length": 6074.47216796875,
+      "completions/min_length": 779.0,
+      "completions/min_terminated_length": 779.0,
+      "entropy": 0.8542795851826668,
+      "epoch": 0.2796688132474701,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0011874900665134192,
+      "learning_rate": 1e-05,
+      "loss": 0.0513,
+      "num_tokens": 247107604.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2227931022644043,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000126361846924,
+      "sampling/importance_sampling_ratio/min": 0.00015846268797758967,
+      "sampling/sampling_logp_difference/max": 8.749991416931152,
+      "sampling/sampling_logp_difference/mean": 0.018691308796405792,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 3.0959752166381804e-06,
+      "clip_ratio/high_mean": 7.739938041595451e-07,
+      "clip_ratio/low_mean": 6.0967123090449604e-05,
+      "clip_ratio/low_min": 2.711407751121442e-05,
+      "clip_ratio/region_mean": 6.17411176335736e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6568.171875,
+      "completions/mean_terminated_length": 6412.365234375,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "entropy": 0.9063890501856804,
+      "epoch": 0.28058877644894203,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002459619427099824,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 247967322.0,
+      "reward": 0.5,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998743534088135,
+      "sampling/importance_sampling_ratio/min": 0.012350871227681637,
+      "sampling/sampling_logp_difference/max": 4.394028663635254,
+      "sampling/sampling_logp_difference/mean": 0.020134467631578445,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 5.9507838159333915e-06,
+      "clip_ratio/high_mean": 1.4876959539833479e-06,
+      "clip_ratio/low_mean": 2.400908408617397e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.549678004015732e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15714.0,
+      "completions/mean_length": 8182.28125,
+      "completions/mean_terminated_length": 7635.50048828125,
+      "completions/min_length": 877.0,
+      "completions/min_terminated_length": 877.0,
+      "entropy": 1.0137704983353615,
+      "epoch": 0.281508739650414,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0016673406353220344,
+      "learning_rate": 1e-05,
+      "loss": 0.0244,
+      "num_tokens": 249031710.0,
+      "reward": 0.3359375,
+      "reward_std": 0.22225631773471832,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998995065689087,
+      "sampling/importance_sampling_ratio/min": 0.0008049134048633277,
+      "sampling/sampling_logp_difference/max": 7.1247758865356445,
+      "sampling/sampling_logp_difference/mean": 0.021704845130443573,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 1.4527202438330278e-05,
+      "clip_ratio/high_mean": 3.6318006095825695e-06,
+      "clip_ratio/low_mean": 3.1829216595724574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5461017205307144e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14502.0,
+      "completions/max_terminated_length": 14502.0,
+      "completions/mean_length": 6460.5703125,
+      "completions/mean_terminated_length": 6460.5703125,
+      "completions/min_length": 804.0,
+      "completions/min_terminated_length": 804.0,
+      "entropy": 1.0418165400624275,
+      "epoch": 0.2824287028518859,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022682021372020245,
+      "learning_rate": 1e-05,
+      "loss": 0.0171,
+      "num_tokens": 249881047.0,
+      "reward": 0.359375,
+      "reward_std": 0.25566887855529785,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999744296073914,
+      "sampling/importance_sampling_ratio/min": 0.002809183904901147,
+      "sampling/sampling_logp_difference/max": 5.874861240386963,
+      "sampling/sampling_logp_difference/mean": 0.02204791083931923,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 9.222687367582694e-06,
+      "clip_ratio/high_mean": 4.125313353142701e-06,
+      "clip_ratio/low_mean": 4.836107154915226e-05,
+      "clip_ratio/low_min": 3.4611657611094415e-06,
+      "clip_ratio/region_mean": 5.248638444754761e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14754.0,
+      "completions/mean_length": 6846.3046875,
+      "completions/mean_terminated_length": 6694.9130859375,
+      "completions/min_length": 944.0,
+      "completions/min_terminated_length": 944.0,
+      "entropy": 0.9839218333363533,
+      "epoch": 0.28334866605335784,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002436346374452114,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 250773806.0,
+      "reward": 0.484375,
+      "reward_std": 0.34299150109291077,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980628490448,
+      "sampling/importance_sampling_ratio/min": 0.0257408544421196,
+      "sampling/sampling_logp_difference/max": 3.6596758365631104,
+      "sampling/sampling_logp_difference/mean": 0.02135510742664337,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 1.3327621218195418e-05,
+      "clip_ratio/high_mean": 3.3319053045488545e-06,
+      "clip_ratio/low_mean": 3.791964286392613e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1251548054788145e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15777.0,
+      "completions/mean_length": 6558.53125,
+      "completions/mean_terminated_length": 6241.58056640625,
+      "completions/min_length": 884.0,
+      "completions/min_terminated_length": 884.0,
+      "entropy": 0.7833076938986778,
+      "epoch": 0.2842686292548298,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002015948062762618,
+      "learning_rate": 1e-05,
+      "loss": 0.0791,
+      "num_tokens": 251633074.0,
+      "reward": 0.46875,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999434947967529,
+      "sampling/importance_sampling_ratio/min": 5.1445105782477185e-05,
+      "sampling/sampling_logp_difference/max": 9.874995231628418,
+      "sampling/sampling_logp_difference/mean": 0.017078280448913574,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.3865982686620555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3865982686620555e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16162.0,
+      "completions/mean_length": 7626.390625,
+      "completions/mean_terminated_length": 7487.38134765625,
+      "completions/min_length": 1400.0,
+      "completions/min_terminated_length": 1400.0,
+      "entropy": 0.8946382254362106,
+      "epoch": 0.28518859245630174,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001098336186259985,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 252629300.0,
+      "reward": 0.3359375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000107288360596,
+      "sampling/importance_sampling_ratio/min": 0.00021643216314259917,
+      "sampling/sampling_logp_difference/max": 8.438233375549316,
+      "sampling/sampling_logp_difference/mean": 0.01972624473273754,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 6.5777783220255515e-06,
+      "clip_ratio/high_mean": 1.6444445805063879e-06,
+      "clip_ratio/low_mean": 1.7658890669736138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9303335250242526e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15763.0,
+      "completions/mean_length": 5796.984375,
+      "completions/mean_terminated_length": 5713.6220703125,
+      "completions/min_length": 528.0,
+      "completions/min_terminated_length": 528.0,
+      "entropy": 0.969724528491497,
+      "epoch": 0.2861085556577737,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003871417138725519,
+      "learning_rate": 1e-05,
+      "loss": 0.0408,
+      "num_tokens": 253389562.0,
+      "reward": 0.484375,
+      "reward_std": 0.23752351105213165,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998880624771118,
+      "sampling/importance_sampling_ratio/min": 2.4301782104885206e-05,
+      "sampling/sampling_logp_difference/max": 10.624960899353027,
+      "sampling/sampling_logp_difference/mean": 0.019220752641558647,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 8.099077376755304e-06,
+      "clip_ratio/high_mean": 2.8300572125772305e-06,
+      "clip_ratio/low_mean": 3.2033483023496956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.486354006554393e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15769.0,
+      "completions/mean_length": 6938.5625,
+      "completions/mean_terminated_length": 6788.63525390625,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.9812447279691696,
+      "epoch": 0.28702851885924563,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002257548039779067,
+      "learning_rate": 1e-05,
+      "loss": -0.0089,
+      "num_tokens": 254295858.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2596206068992615,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000464916229248,
+      "sampling/importance_sampling_ratio/min": 0.0009388317703269422,
+      "sampling/sampling_logp_difference/max": 6.970874309539795,
+      "sampling/sampling_logp_difference/mean": 0.02080199122428894,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 4.441917553776875e-06,
+      "clip_ratio/high_mean": 1.1104793884442188e-06,
+      "clip_ratio/low_mean": 3.414505465570983e-05,
+      "clip_ratio/low_min": 3.790060873143375e-06,
+      "clip_ratio/region_mean": 3.5255534044154047e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15163.0,
+      "completions/mean_length": 6878.15625,
+      "completions/mean_terminated_length": 6650.01611328125,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.9106859937310219,
+      "epoch": 0.28794848206071755,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00420041661709547,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 255197110.0,
+      "reward": 0.421875,
+      "reward_std": 0.30433881282806396,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999925494194031,
+      "sampling/importance_sampling_ratio/min": 0.015217061154544353,
+      "sampling/sampling_logp_difference/max": 4.185338020324707,
+      "sampling/sampling_logp_difference/mean": 0.02016574889421463,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 8.814751254249131e-06,
+      "clip_ratio/high_mean": 2.203687813562283e-06,
+      "clip_ratio/low_mean": 3.137724206681014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3580929766685585e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 6260.2578125,
+      "completions/mean_terminated_length": 6260.2578125,
+      "completions/min_length": 790.0,
+      "completions/min_terminated_length": 790.0,
+      "entropy": 0.9523455575108528,
+      "epoch": 0.2888684452621895,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027907798066735268,
+      "learning_rate": 1e-05,
+      "loss": 0.0302,
+      "num_tokens": 256018935.0,
+      "reward": 0.421875,
+      "reward_std": 0.2659186124801636,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000364780426025,
+      "sampling/importance_sampling_ratio/min": 7.485197420464829e-05,
+      "sampling/sampling_logp_difference/max": 9.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.0191945917904377,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 2.8685263259831117e-05,
+      "clip_ratio/high_mean": 7.171315814957779e-06,
+      "clip_ratio/low_mean": 2.780131131885355e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.497262770224552e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16124.0,
+      "completions/mean_length": 6202.828125,
+      "completions/mean_terminated_length": 6041.22265625,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.8513326346874237,
+      "epoch": 0.28978840846366144,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023744129575788975,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 256841129.0,
+      "reward": 0.5625,
+      "reward_std": 0.32407689094543457,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000146627426147,
+      "sampling/importance_sampling_ratio/min": 9.269781003240496e-06,
+      "sampling/sampling_logp_difference/max": 11.588750839233398,
+      "sampling/sampling_logp_difference/mean": 0.019519174471497536,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 1.6381697605538648e-05,
+      "clip_ratio/high_mean": 4.095424401384662e-06,
+      "clip_ratio/low_mean": 3.0394592840821133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.449001792432682e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16328.0,
+      "completions/mean_length": 8019.4609375,
+      "completions/mean_terminated_length": 7073.90380859375,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "entropy": 0.9211000874638557,
+      "epoch": 0.2907083716651334,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024705040268599987,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 257884188.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999315738677979,
+      "sampling/importance_sampling_ratio/min": 0.016358470544219017,
+      "sampling/sampling_logp_difference/max": 4.113009452819824,
+      "sampling/sampling_logp_difference/mean": 0.01984308287501335,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 7.485402420570608e-06,
+      "clip_ratio/high_mean": 1.871350605142652e-06,
+      "clip_ratio/low_mean": 3.025547425750119e-05,
+      "clip_ratio/low_min": 2.697337095014518e-06,
+      "clip_ratio/region_mean": 3.212682509001752e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15210.0,
+      "completions/mean_length": 7257.6875,
+      "completions/mean_terminated_length": 7038.65625,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 0.8801277950406075,
+      "epoch": 0.29162833486660533,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032848953269422054,
+      "learning_rate": 1e-05,
+      "loss": 0.0305,
+      "num_tokens": 258831852.0,
+      "reward": 0.4296875,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986124992371,
+      "sampling/importance_sampling_ratio/min": 0.00019848966621793807,
+      "sampling/sampling_logp_difference/max": 8.524773597717285,
+      "sampling/sampling_logp_difference/mean": 0.019743187353014946,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 1.52771035573096e-05,
+      "clip_ratio/high_mean": 3.8192758893274e-06,
+      "clip_ratio/low_mean": 3.605492440783564e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.987420052453672e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14903.0,
+      "completions/mean_length": 6042.84375,
+      "completions/mean_terminated_length": 5878.69873046875,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.8792382404208183,
+      "epoch": 0.29254829806807725,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004201764706522226,
+      "learning_rate": 1e-05,
+      "loss": 0.099,
+      "num_tokens": 259623512.0,
+      "reward": 0.640625,
+      "reward_std": 0.3913668990135193,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998612403869629,
+      "sampling/importance_sampling_ratio/min": 0.00027811730979010463,
+      "sampling/sampling_logp_difference/max": 8.187467575073242,
+      "sampling/sampling_logp_difference/mean": 0.018901977688074112,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.1642084397608414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1642084397608414e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16381.0,
+      "completions/mean_length": 7667.6875,
+      "completions/mean_terminated_length": 7458.49658203125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9096411988139153,
+      "epoch": 0.2934682612695492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014557713875547051,
+      "learning_rate": 1e-05,
+      "loss": 0.0383,
+      "num_tokens": 260623928.0,
+      "reward": 0.3515625,
+      "reward_std": 0.22726887464523315,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999264478683472,
+      "sampling/importance_sampling_ratio/min": 0.0002615761768538505,
+      "sampling/sampling_logp_difference/max": 8.248785018920898,
+      "sampling/sampling_logp_difference/mean": 0.01979639381170273,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 2.36019068324822e-05,
+      "clip_ratio/high_mean": 5.90047670812055e-06,
+      "clip_ratio/low_mean": 2.704614530557592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2946622809504333e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15514.0,
+      "completions/max_terminated_length": 15514.0,
+      "completions/mean_length": 6428.8203125,
+      "completions/mean_terminated_length": 6428.8203125,
+      "completions/min_length": 617.0,
+      "completions/min_terminated_length": 617.0,
+      "entropy": 0.9974069148302078,
+      "epoch": 0.29438822447102114,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028210312593728304,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 261465625.0,
+      "reward": 0.46875,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000195503234863,
+      "sampling/importance_sampling_ratio/min": 0.001225265790708363,
+      "sampling/sampling_logp_difference/max": 6.704597473144531,
+      "sampling/sampling_logp_difference/mean": 0.021066997200250626,
+      "step": 320
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 261465625,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-320/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-320/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# Set to a truthy value to make the parsing helpers print extra diagnostics.
+debug = 0
+
+# load to cpu
+# (passed as ``map_location`` to every ``torch.load`` call in this script)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Rebuild the fp32 state_dict from the files of one DeepSpeed checkpoint folder.
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: forwarded unchanged to the stage-specific merge routine
+
+    Returns:
+        - the merged state_dict for zero stage <= 2 or stage 3; ``None`` for any other stage value
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir), ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        merge_fn = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        merge_fn = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return merge_fn(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy rank 0's frozen parameters into ``state_dict``; no-op when no frozen shapes were recorded.
+    """
+    rank0_state = zero_model_states[0]
+    shapes_by_name = rank0_state.frozen_param_shapes
+    if shapes_by_name is None or len(shapes_by_name) == 0:
+        return
+    fragments_by_name = rank0_state.frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in shapes_by_name.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(shapes_by_name)
+        wanted_numel = sum(s.numel() for s in shapes_by_name.values())
+        avail_numel = sum(p.numel() for p in fragments_by_name.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_numel = 0
+    total_params = 0
+    for total_params, (name, shape) in enumerate(shapes_by_name.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = fragments_by_name[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    # True only when ``obj`` exposes an attribute named ``fn`` and that attribute can be called
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild every trainable parameter of a ZeRO stage <= 2 checkpoint and store it in ``state_dict``.
+
+    For each param group the partitions of all ranks are concatenated into one flat vector, which is
+    then sliced, in the order of rank 0's ``param_shapes``, into views of the recorded shapes.
+    Raises ``ValueError`` when the aligned consumed size and the aligned vector size of a group differ.
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    num_param_groups = len(fp32_flat_groups[0])
+
+    if debug:
+        for i in range(world_size):
+            for j in range(num_param_groups):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    merged_groups = [
+        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
+        for group_idx in range(num_param_groups)
+    ]
+
+    if debug:
+        avail_numel = sum(flat_vector.numel() for flat_vector in merged_groups)
+        wanted_params = sum(len(shapes) for shapes in param_shapes)
+        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+    # live optimizer object, so we are checking that the numbers are within the right range
+    align_to = 2 * world_size
+
+    def zero2_align(x):
+        return align_to * math.ceil(x / align_to)
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, flat_vector in zip(param_shapes, merged_groups):
+        offset = 0
+        for name, shape in shapes.items():
+            if _has_callable(shape, 'numel'):
+                unpartitioned_numel = shape.numel()
+            else:
+                unpartitioned_numel = math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = flat_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        avail_numel = flat_vector.numel()
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the ZeRO stage <= 2 state_dict: buffers, frozen params (unless excluded), trainable params, shared aliases.
+    """
+    state_dict = OrderedDict()
+    rank0_state = zero_model_states[0]
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    for pair in rank0_state.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    # returns (elements per rank, rounded up; padding needed to make the total divisible by world_size)
+    leftover = unpartitioned_numel % world_size
+    padding_numel = world_size - leftover if leftover else 0
+    return math.ceil(unpartitioned_numel / world_size), padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Join every rank's frozen-param fragments into full tensors in ``state_dict``; no-op without frozen shapes.
+    """
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = world_size * sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_numel = 0
+    total_params = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # concatenate the fragments in rank order, then drop whatever trails the real elements
+        gathered = torch.cat(tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states), 0)
+        state_dict[name] = gathered.narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        # flat_groups[rank][group] is a flat tensor; flat_groups_offset[g] is where group g starts
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor: each rank's slice [offset, offset +
+        partitioned_numel) is concatenated in rank order, cut to ``shape.numel()`` and viewed as ``shape``.
+        """
+        group_starts = self.flat_groups_offset
+        end_idx = self.offset + self.partitioned_numel
+
+        # the groups touched by the slice do not depend on the rank, so look them up only once
+        start_group_id = None
+        end_group_id = None
+        for group_id in range(len(group_starts)):
+            if group_starts[group_id] <= self.offset < group_starts[group_id + 1]:
+                start_group_id = group_id
+            if group_starts[group_id] < end_idx <= group_starts[group_id + 1]:
+                end_group_id = group_id
+                break
+
+        pad_flat_param_chunks = []
+        for flat_groups_at_rank_i in self.flat_groups:
+            for group_id in range(start_group_id, end_group_id + 1):
+                group_start = group_starts[group_id]
+                # clamp: when a param spans several groups, later groups are read from their beginning
+                start_offset = max(self.offset, group_start) - group_start
+                end_offset = min(end_idx, group_starts[group_id + 1]) - group_start
+                pad_flat_param_chunks.append(flat_groups_at_rank_i[group_id][start_offset:end_offset])
+
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        return pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register a lazy ``GatheredTensor`` in ``state_dict`` for every trainable param of a ZeRO stage 3 checkpoint.
+    Raises ``ValueError`` if the partitions consumed across all ranks do not add up to the available elements.
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={[flat_group.shape for flat_group in fp32_flat_groups[i]]}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # fp32_flat_groups[rank] is a list of flat tensors (one per group); avail_numel computed above already
+        # sums them, so it is reused here. Not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        state_dict[name] = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        offset += partitioned_numel
+
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the ZeRO stage 3 state_dict: buffers, frozen params (unless excluded), trainable params, shared aliases.
+    """
+    state_dict = OrderedDict()
+    rank0_state = zero_model_states[0]
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    for pair in rank0_state.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+
+    Entries that are the very same object (shared params) are converted once and reuse that result.
+    With ``return_empty_tensor`` only ``torch.empty`` placeholders of matching shape/dtype are made.
+    """
+    torch_state_dict = {}
+    converted_by_id = {}  # id(source object) -> tensor already produced for it
+    for name, tensor in state_dict.items():
+        key = id(tensor)
+        if key not in converted_by_id:
+            if return_empty_tensor:
+                converted_by_id[key] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                converted_by_id[key] = tensor.contiguous()
+        torch_state_dict[name] = converted_by_id[key]
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pseudo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+          (with ``lazy_mode`` the dict is returned as built, i.e. without the ``.contiguous()`` conversion)
+
+    Raises:
+        - ``ValueError``: ``tag`` is not given and there is no ``latest`` file in ``checkpoint_dir``
+        - ``FileNotFoundError``: the tag folder does not exist inside ``checkpoint_dir``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    return state_dict if lazy_mode else to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into consolidated fp32 ``state_dict`` file(s) under ``output_dir`` (one file per
+    shard, plus an index json when sharded) that can be loaded and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB; ``None`` writes a single file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check: import the optional packages first so a missing one fails before any conversion work
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict (lazy mode: entries are converted later, shard by shard)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the shards on empty placeholder tensors
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard; only the entries of the current shard are converted via to_torch_tensor
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard before the next shard is converted
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded (maps every tensor name to the shard file that holds it)
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    2. Put the provided model to cpu
+    3. Load the ``state_dict`` into the provided model
+       (non-strict, i.e. ``load_state_dict(..., strict=False)``)
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    cpu_model = model.cpu()
+    cpu_model.load_state_dict(fp32_weights, strict=False)
+    return cpu_model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    # publish the flag as the module-level ``debug`` global that the helper functions read
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/README.md b/dapo_lora_plus_20251202_001141/checkpoint-384/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-384/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-384/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-384/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-384/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-384/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/latest b/dapo_lora_plus_20251202_001141/checkpoint-384/latest
new file mode 100644
index 0000000000000000000000000000000000000000..47a30b050fc0cf5b9cd367ab63c36191546d4ff7
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-384/latest
@@ -0,0 +1 @@
+global_step384
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-384/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-384/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/tokenizer_config.json b/dapo_lora_plus_20251202_001141/checkpoint-384/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-384/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-384/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6376071922fbdb42eef3a3f5ed4dc7eeb5391aac
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-384/trainer_state.json
@@ -0,0 +1,11938 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3532658693652254,
+  "eval_steps": 500,
+  "global_step": 384,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025745572056621313,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 5.499582130141789e-06,
+      "clip_ratio/high_mean": 1.3748955325354473e-06,
+      "clip_ratio/low_mean": 2.871888784738985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009378326623846e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 4767.1875,
+      "completions/mean_terminated_length": 4767.1875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.088237851858139,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002068034838885069,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 1425798.0,
+      "reward": 0.3046875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 0.01811397261917591,
+      "sampling/sampling_logp_difference/max": 4.011071681976318,
+      "sampling/sampling_logp_difference/mean": 0.01877593621611595,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.459846724103045e-05,
+      "clip_ratio/low_min": 3.4060874440910993e-06,
+      "clip_ratio/region_mean": 4.459846724103045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 6586.359375,
+      "completions/mean_terminated_length": 6351.21630859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0497623533010483,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001971944235265255,
+      "learning_rate": 1e-05,
+      "loss": 0.0199,
+      "num_tokens": 2287420.0,
+      "reward": 0.28125,
+      "reward_std": 0.29143062233924866,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999316334724426,
+      "sampling/importance_sampling_ratio/min": 5.356698966352269e-05,
+      "sampling/sampling_logp_difference/max": 9.834577560424805,
+      "sampling/sampling_logp_difference/mean": 0.02137824520468712,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.7640652004047297e-05,
+      "clip_ratio/high_mean": 5.48578327652649e-06,
+      "clip_ratio/low_mean": 3.218628648937738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.767206976590387e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14690.0,
+      "completions/max_terminated_length": 14690.0,
+      "completions/mean_length": 5448.0234375,
+      "completions/mean_terminated_length": 5448.0234375,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 1.1134418621659279,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016465173102915287,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 3009167.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27958330512046814,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 7.889385415182915e-06,
+      "sampling/sampling_logp_difference/max": 11.749992370605469,
+      "sampling/sampling_logp_difference/mean": 0.020580951124429703,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 1.3439519989333348e-05,
+      "clip_ratio/high_mean": 3.359879997333337e-06,
+      "clip_ratio/low_mean": 2.8849915906903334e-05,
+      "clip_ratio/low_min": 8.467687621305231e-06,
+      "clip_ratio/region_mean": 3.220979442630778e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13420.0,
+      "completions/mean_length": 5436.8671875,
+      "completions/mean_terminated_length": 5350.66943359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 1.1473859176039696,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023770295083522797,
+      "learning_rate": 1e-05,
+      "loss": 0.0153,
+      "num_tokens": 3725654.0,
+      "reward": 0.2734375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99991774559021,
+      "sampling/importance_sampling_ratio/min": 0.0011146117467433214,
+      "sampling/sampling_logp_difference/max": 6.799249172210693,
+      "sampling/sampling_logp_difference/mean": 0.020377254113554955,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 4.652201369026443e-06,
+      "clip_ratio/high_mean": 1.1630503422566107e-06,
+      "clip_ratio/low_mean": 2.8399212624208303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9562263534899103e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14440.0,
+      "completions/max_terminated_length": 14440.0,
+      "completions/mean_length": 4697.5390625,
+      "completions/mean_terminated_length": 4697.5390625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.0097229778766632,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003342699259519577,
+      "learning_rate": 1e-05,
+      "loss": 0.0326,
+      "num_tokens": 4345547.0,
+      "reward": 0.390625,
+      "reward_std": 0.34480881690979004,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914765357971,
+      "sampling/importance_sampling_ratio/min": 0.002385853324085474,
+      "sampling/sampling_logp_difference/max": 6.038198471069336,
+      "sampling/sampling_logp_difference/mean": 0.0185473021119833,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 9.362594937556423e-06,
+      "clip_ratio/high_mean": 2.340648734389106e-06,
+      "clip_ratio/low_mean": 6.054362825125281e-05,
+      "clip_ratio/low_min": 7.427356649714056e-06,
+      "clip_ratio/region_mean": 6.288427744038927e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14652.0,
+      "completions/mean_length": 6218.2109375,
+      "completions/mean_terminated_length": 5890.2822265625,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.0579778030514717,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002073560608550906,
+      "learning_rate": 1e-05,
+      "loss": 0.0201,
+      "num_tokens": 5160646.0,
+      "reward": 0.2109375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 0.00044544730917550623,
+      "sampling/sampling_logp_difference/max": 7.716431617736816,
+      "sampling/sampling_logp_difference/mean": 0.020321575924754143,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 1.1064067621191498e-05,
+      "clip_ratio/high_mean": 2.7660169052978745e-06,
+      "clip_ratio/low_mean": 2.2175867059104348e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4941883737028547e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13637.0,
+      "completions/mean_length": 5127.8359375,
+      "completions/mean_terminated_length": 5039.20458984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.0472618415951729,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032994600478559732,
+      "learning_rate": 1e-05,
+      "loss": 0.0751,
+      "num_tokens": 5836289.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999483227729797,
+      "sampling/importance_sampling_ratio/min": 0.0013780994340777397,
+      "sampling/sampling_logp_difference/max": 6.587049961090088,
+      "sampling/sampling_logp_difference/mean": 0.01940803974866867,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 1.2357884770608507e-05,
+      "clip_ratio/high_mean": 3.0894711926521268e-06,
+      "clip_ratio/low_mean": 3.000627111759968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.309574231025181e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15916.0,
+      "completions/mean_length": 4516.890625,
+      "completions/mean_terminated_length": 4423.44873046875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "entropy": 0.911251038312912,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003016560571268201,
+      "learning_rate": 1e-05,
+      "loss": 0.1006,
+      "num_tokens": 6433171.0,
+      "reward": 0.390625,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999179840087891,
+      "sampling/importance_sampling_ratio/min": 0.005480794236063957,
+      "sampling/sampling_logp_difference/max": 5.206505298614502,
+      "sampling/sampling_logp_difference/mean": 0.017437148839235306,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 4.6329013457580004e-05,
+      "clip_ratio/high_mean": 1.1582253364395001e-05,
+      "clip_ratio/low_mean": 7.069455705277505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.227681109929108e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13970.0,
+      "completions/mean_length": 4961.453125,
+      "completions/mean_terminated_length": 4687.31201171875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "entropy": 0.6808596402406693,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0035386616364121437,
+      "learning_rate": 1e-05,
+      "loss": 0.0596,
+      "num_tokens": 7085389.0,
+      "reward": 0.5625,
+      "reward_std": 0.3816363215446472,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0002734088629949838,
+      "sampling/sampling_logp_difference/max": 8.20454216003418,
+      "sampling/sampling_logp_difference/mean": 0.01566406339406967,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 2.43190661421977e-05,
+      "clip_ratio/high_mean": 6.079766535549425e-06,
+      "clip_ratio/low_mean": 2.2395396172214532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8475162707763957e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14776.0,
+      "completions/mean_length": 4429.40625,
+      "completions/mean_terminated_length": 4335.275390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9181502386927605,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022535293828696012,
+      "learning_rate": 1e-05,
+      "loss": 0.0031,
+      "num_tokens": 7672185.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20357418060302734,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998801946640015,
+      "sampling/importance_sampling_ratio/min": 5.315856554943821e-08,
+      "sampling/sampling_logp_difference/max": 16.74998664855957,
+      "sampling/sampling_logp_difference/mean": 0.018429335206747055,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 1.0117325928149512e-05,
+      "clip_ratio/high_mean": 2.529331482037378e-06,
+      "clip_ratio/low_mean": 1.1982813475697185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.45121450714214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5282.6796875,
+      "completions/mean_terminated_length": 5106.46875,
+      "completions/min_length": 323.0,
+      "completions/min_terminated_length": 323.0,
+      "entropy": 1.113751620054245,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013591813622042537,
+      "learning_rate": 1e-05,
+      "loss": 0.0971,
+      "num_tokens": 8369000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3029736578464508,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 3.970265970565379e-05,
+      "sampling/sampling_logp_difference/max": 10.134092330932617,
+      "sampling/sampling_logp_difference/mean": 0.020221836864948273,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 5.411958227341529e-06,
+      "clip_ratio/high_mean": 1.3529895568353822e-06,
+      "clip_ratio/low_mean": 2.5284593846208736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6637583516730956e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15925.0,
+      "completions/mean_length": 6970.421875,
+      "completions/mean_terminated_length": 6744.49609375,
+      "completions/min_length": 53.0,
+      "completions/min_terminated_length": 53.0,
+      "entropy": 1.1721933633089066,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024079051800072193,
+      "learning_rate": 1e-05,
+      "loss": 0.0713,
+      "num_tokens": 9283182.0,
+      "reward": 0.171875,
+      "reward_std": 0.17965975403785706,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999163746833801,
+      "sampling/importance_sampling_ratio/min": 0.0008915197686292231,
+      "sampling/sampling_logp_difference/max": 7.0225830078125,
+      "sampling/sampling_logp_difference/mean": 0.021462474018335342,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 2.0661535927501973e-05,
+      "clip_ratio/high_mean": 5.165383981875493e-06,
+      "clip_ratio/low_mean": 2.4304956298237812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.947033948430544e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14658.0,
+      "completions/max_terminated_length": 14658.0,
+      "completions/mean_length": 4886.875,
+      "completions/mean_terminated_length": 4886.875,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 1.0108910650014877,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002063734456896782,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 9928446.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2409384697675705,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000026226043701,
+      "sampling/importance_sampling_ratio/min": 0.0003672837920021266,
+      "sampling/sampling_logp_difference/max": 7.9093756675720215,
+      "sampling/sampling_logp_difference/mean": 0.01918785460293293,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.4761846993424115e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4761846993424115e-06,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12992.0,
+      "completions/max_terminated_length": 12992.0,
+      "completions/mean_length": 4824.0078125,
+      "completions/mean_terminated_length": 4824.0078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 1.1070282831788063,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002424790756776929,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 10566415.0,
+      "reward": 0.28125,
+      "reward_std": 0.23698672652244568,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0011708867968991399,
+      "sampling/sampling_logp_difference/max": 6.749993801116943,
+      "sampling/sampling_logp_difference/mean": 0.02069389820098877,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 3.5075904634140898e-06,
+      "clip_ratio/high_mean": 8.768976158535224e-07,
+      "clip_ratio/low_mean": 2.2676964135825983e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3553861751679506e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12685.0,
+      "completions/mean_length": 5449.4140625,
+      "completions/mean_terminated_length": 5363.31494140625,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9817888736724854,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021046048495918512,
+      "learning_rate": 1e-05,
+      "loss": 0.0252,
+      "num_tokens": 11281908.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805688858032,
+      "sampling/importance_sampling_ratio/min": 0.013273254036903381,
+      "sampling/sampling_logp_difference/max": 4.322004318237305,
+      "sampling/sampling_logp_difference/mean": 0.019556276500225067,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 1.624216065465589e-05,
+      "clip_ratio/high_mean": 4.060540163663973e-06,
+      "clip_ratio/low_mean": 5.4349347919924185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.840988796990132e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14133.0,
+      "completions/max_terminated_length": 14133.0,
+      "completions/mean_length": 5343.25,
+      "completions/mean_terminated_length": 5343.25,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 1.04741720110178,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035894038155674934,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 11987692.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998996257781982,
+      "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05,
+      "sampling/sampling_logp_difference/max": 10.749964714050293,
+      "sampling/sampling_logp_difference/mean": 0.020530637353658676,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.272115029380075e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.272115029380075e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15138.0,
+      "completions/mean_length": 6301.9375,
+      "completions/mean_terminated_length": 5806.09814453125,
+      "completions/min_length": 72.0,
+      "completions/min_terminated_length": 72.0,
+      "entropy": 0.8892941772937775,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032246762420982122,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 12814244.0,
+      "reward": 0.3125,
+      "reward_std": 0.3606000542640686,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999184608459473,
+      "sampling/importance_sampling_ratio/min": 0.021351110190153122,
+      "sampling/sampling_logp_difference/max": 3.846651554107666,
+      "sampling/sampling_logp_difference/mean": 0.017541853711009026,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 9.956602298188955e-06,
+      "clip_ratio/high_mean": 2.4891505745472386e-06,
+      "clip_ratio/low_mean": 2.772165316855535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0210803743102588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16213.0,
+      "completions/max_terminated_length": 16213.0,
+      "completions/mean_length": 5297.46875,
+      "completions/mean_terminated_length": 5297.46875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8097029253840446,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023969109170138836,
+      "learning_rate": 1e-05,
+      "loss": -0.0153,
+      "num_tokens": 13512520.0,
+      "reward": 0.359375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999222159385681,
+      "sampling/importance_sampling_ratio/min": 0.005766105372458696,
+      "sampling/sampling_logp_difference/max": 5.155758380889893,
+      "sampling/sampling_logp_difference/mean": 0.017464376986026764,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 1.0098337497765897e-05,
+      "clip_ratio/high_mean": 2.524584374441474e-06,
+      "clip_ratio/low_mean": 3.173396362399217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.425854845318099e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14655.0,
+      "completions/mean_length": 4890.34375,
+      "completions/mean_terminated_length": 4799.84228515625,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.9267145916819572,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002759338356554508,
+      "learning_rate": 1e-05,
+      "loss": -0.0014,
+      "num_tokens": 14155556.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.008491010405123234,
+      "sampling/sampling_logp_difference/max": 4.768747329711914,
+      "sampling/sampling_logp_difference/mean": 0.018839433789253235,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 7.532389190600952e-06,
+      "clip_ratio/high_mean": 1.883097297650238e-06,
+      "clip_ratio/low_mean": 1.9051809317716106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0934906729053182e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16296.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 4609.40625,
+      "completions/mean_terminated_length": 4609.40625,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 1.171089917421341,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021055075339972973,
+      "learning_rate": 1e-05,
+      "loss": -0.0051,
+      "num_tokens": 14765328.0,
+      "reward": 0.2421875,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741911888123,
+      "sampling/importance_sampling_ratio/min": 5.368983693188056e-07,
+      "sampling/sampling_logp_difference/max": 14.437457084655762,
+      "sampling/sampling_logp_difference/mean": 0.020226795226335526,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.7169573766295798e-05,
+      "clip_ratio/high_mean": 4.2923934415739495e-06,
+      "clip_ratio/low_mean": 5.869748633813288e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.0162142189074075e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14299.0,
+      "completions/mean_length": 5099.0390625,
+      "completions/mean_terminated_length": 5010.18115234375,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.005959376692772,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027595218271017075,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 15438549.0,
+      "reward": 0.296875,
+      "reward_std": 0.20069602131843567,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887347221375,
+      "sampling/importance_sampling_ratio/min": 0.00013984869292471558,
+      "sampling/sampling_logp_difference/max": 8.87494945526123,
+      "sampling/sampling_logp_difference/mean": 0.01902824640274048,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 5.162942670722259e-06,
+      "clip_ratio/high_mean": 1.2907356676805648e-06,
+      "clip_ratio/low_mean": 3.6872071063953626e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.816280593582633e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7138.0390625,
+      "completions/mean_terminated_length": 6839.7822265625,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.0403362140059471,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002748022088780999,
+      "learning_rate": 1e-05,
+      "loss": 0.0647,
+      "num_tokens": 16373898.0,
+      "reward": 0.296875,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999048709869385,
+      "sampling/importance_sampling_ratio/min": 0.0003802926803473383,
+      "sampling/sampling_logp_difference/max": 7.874569416046143,
+      "sampling/sampling_logp_difference/mean": 0.020853528752923012,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.6506045439164154e-05,
+      "clip_ratio/low_min": 5.709326615033206e-06,
+      "clip_ratio/region_mean": 5.6506045439164154e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14543.0,
+      "completions/mean_length": 5420.515625,
+      "completions/mean_terminated_length": 5334.18896484375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 1.1339883506298065,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029502976685762405,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 17088156.0,
+      "reward": 0.1953125,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 9.70982582657598e-05,
+      "sampling/sampling_logp_difference/max": 9.239787101745605,
+      "sampling/sampling_logp_difference/mean": 0.0199423898011446,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 5.619998319161823e-06,
+      "clip_ratio/high_mean": 1.4049995797904558e-06,
+      "clip_ratio/low_mean": 6.439320418394345e-05,
+      "clip_ratio/low_min": 4.70632539872895e-06,
+      "clip_ratio/region_mean": 6.57982034226734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14636.0,
+      "completions/mean_length": 5116.3046875,
+      "completions/mean_terminated_length": 4845.88037109375,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.9503882825374603,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004891107324510813,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 17766619.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0010618992382660508,
+      "sampling/sampling_logp_difference/max": 6.847696304321289,
+      "sampling/sampling_logp_difference/mean": 0.01914183795452118,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.839018643247982e-05,
+      "clip_ratio/low_min": 4.115091087442124e-06,
+      "clip_ratio/region_mean": 3.839018643247982e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14634.0,
+      "completions/mean_length": 5061.8671875,
+      "completions/mean_terminated_length": 4972.71630859375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.0540335327386856,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030373274348676205,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 18432938.0,
+      "reward": 0.34375,
+      "reward_std": 0.28118088841438293,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999624490737915,
+      "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06,
+      "sampling/sampling_logp_difference/max": 13.272432327270508,
+      "sampling/sampling_logp_difference/mean": 0.019548218697309494,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.4656657867817557e-05,
+      "clip_ratio/high_mean": 4.665093399580655e-06,
+      "clip_ratio/low_mean": 3.751162262233265e-05,
+      "clip_ratio/low_min": 4.413062470121076e-06,
+      "clip_ratio/region_mean": 4.2176716192443564e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15782.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6349.9765625,
+      "completions/mean_terminated_length": 6349.9765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0268081277608871,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017623496241867542,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 19264743.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 6.870362267363816e-05,
+      "sampling/sampling_logp_difference/max": 9.585708618164062,
+      "sampling/sampling_logp_difference/mean": 0.019106190651655197,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 9.221375876222737e-06,
+      "clip_ratio/high_mean": 2.3053439690556843e-06,
+      "clip_ratio/low_mean": 3.09787185415189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.328406273794826e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 5815.484375,
+      "completions/mean_terminated_length": 5561.84033203125,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 1.0389493256807327,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003111837198957801,
+      "learning_rate": 1e-05,
+      "loss": -0.0162,
+      "num_tokens": 20030109.0,
+      "reward": 0.34375,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000298023223877,
+      "sampling/importance_sampling_ratio/min": 0.02987043187022209,
+      "sampling/sampling_logp_difference/max": 3.5108861923217773,
+      "sampling/sampling_logp_difference/mean": 0.020060991868376732,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 6.7810142354574054e-06,
+      "clip_ratio/high_mean": 1.6952535588643514e-06,
+      "clip_ratio/low_mean": 4.474762545214617e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644287901101052e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 5157.1484375,
+      "completions/mean_terminated_length": 5068.748046875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.0510126948356628,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003041633637621999,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 20710904.0,
+      "reward": 0.3125,
+      "reward_std": 0.35612428188323975,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 0.04357198625802994,
+      "sampling/sampling_logp_difference/max": 3.133340835571289,
+      "sampling/sampling_logp_difference/mean": 0.019007597118616104,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0962848566341563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0962848566341563e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15333.0,
+      "completions/max_terminated_length": 15333.0,
+      "completions/mean_length": 4446.3828125,
+      "completions/mean_terminated_length": 4446.3828125,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.053279548883438,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022369560319930315,
+      "learning_rate": 1e-05,
+      "loss": -0.001,
+      "num_tokens": 21298497.0,
+      "reward": 0.390625,
+      "reward_std": 0.24169495701789856,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998750686645508,
+      "sampling/importance_sampling_ratio/min": 0.006704842206090689,
+      "sampling/sampling_logp_difference/max": 5.00492525100708,
+      "sampling/sampling_logp_difference/mean": 0.01947362720966339,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8460265411922592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8460265411922592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15386.0,
+      "completions/mean_length": 6294.1484375,
+      "completions/mean_terminated_length": 6133.9921875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.2036212533712387,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021383841522037983,
+      "learning_rate": 1e-05,
+      "loss": 0.033,
+      "num_tokens": 22124812.0,
+      "reward": 0.171875,
+      "reward_std": 0.20752590894699097,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858736991882,
+      "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07,
+      "sampling/sampling_logp_difference/max": 14.742476463317871,
+      "sampling/sampling_logp_difference/mean": 0.022367021068930626,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 1.73864664247958e-05,
+      "clip_ratio/high_mean": 4.34661660619895e-06,
+      "clip_ratio/low_mean": 3.19569651310303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.630358173722925e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14893.0,
+      "completions/mean_length": 6011.4921875,
+      "completions/mean_terminated_length": 5929.81884765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.123318687081337,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00126531848218292,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 22915091.0,
+      "reward": 0.171875,
+      "reward_std": 0.2330477386713028,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05,
+      "sampling/sampling_logp_difference/max": 11.02016544342041,
+      "sampling/sampling_logp_difference/mean": 0.019905246794223785,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 2.8753217975463485e-05,
+      "clip_ratio/high_mean": 7.188304493865871e-06,
+      "clip_ratio/low_mean": 3.818478444372886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.537308905128157e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5152.46875,
+      "completions/mean_terminated_length": 5064.03125,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "entropy": 1.0477670058608055,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030069497879594564,
+      "learning_rate": 1e-05,
+      "loss": 0.1026,
+      "num_tokens": 23596487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29142576456069946,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999433755874634,
+      "sampling/importance_sampling_ratio/min": 9.009604013954231e-07,
+      "sampling/sampling_logp_difference/max": 13.919804573059082,
+      "sampling/sampling_logp_difference/mean": 0.019003981724381447,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 3.069575450354023e-05,
+      "clip_ratio/high_mean": 7.673938625885057e-06,
+      "clip_ratio/low_mean": 3.4847614415411954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.252155258654966e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12792.0,
+      "completions/max_terminated_length": 12792.0,
+      "completions/mean_length": 4672.5703125,
+      "completions/mean_terminated_length": 4672.5703125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9471446052193642,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002676331205293536,
+      "learning_rate": 1e-05,
+      "loss": 0.0724,
+      "num_tokens": 24213408.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2988021969795227,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000251531600952,
+      "sampling/importance_sampling_ratio/min": 0.0013351094676181674,
+      "sampling/sampling_logp_difference/max": 6.618741989135742,
+      "sampling/sampling_logp_difference/mean": 0.0179576613008976,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6127243245355203e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6127243245355203e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16108.0,
+      "completions/mean_length": 7013.734375,
+      "completions/mean_terminated_length": 6711.4677734375,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "entropy": 1.1254516392946243,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023615453392267227,
+      "learning_rate": 1e-05,
+      "loss": 0.0384,
+      "num_tokens": 25130262.0,
+      "reward": 0.1953125,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06,
+      "sampling/sampling_logp_difference/max": 11.925450325012207,
+      "sampling/sampling_logp_difference/mean": 0.0215257927775383,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 4.06954040954588e-06,
+      "clip_ratio/high_mean": 1.01738510238647e-06,
+      "clip_ratio/low_mean": 4.180071573500754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.281810015527299e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5858.59375,
+      "completions/mean_terminated_length": 5605.984375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "entropy": 1.0713739022612572,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029018481727689505,
+      "learning_rate": 1e-05,
+      "loss": 0.1041,
+      "num_tokens": 25898194.0,
+      "reward": 0.3671875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999915957450867,
+      "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05,
+      "sampling/sampling_logp_difference/max": 10.992064476013184,
+      "sampling/sampling_logp_difference/mean": 0.019959844648838043,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 1.2810827229259303e-05,
+      "clip_ratio/high_mean": 3.2027068073148257e-06,
+      "clip_ratio/low_mean": 3.29701083501277e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.617281504375569e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14004.0,
+      "completions/mean_length": 6952.6015625,
+      "completions/mean_terminated_length": 6726.24853515625,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "entropy": 1.028619796037674,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022342968732118607,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 26812791.0,
+      "reward": 0.234375,
+      "reward_std": 0.26827272772789,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 4.540153167909011e-05,
+      "sampling/sampling_logp_difference/max": 9.999964714050293,
+      "sampling/sampling_logp_difference/mean": 0.02002539485692978,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 1.5225089100567857e-05,
+      "clip_ratio/high_mean": 6.960676159906143e-06,
+      "clip_ratio/low_mean": 4.09088329433871e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7869508762232726e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16361.0,
+      "completions/mean_length": 6413.421875,
+      "completions/mean_terminated_length": 6174.12841796875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9452399462461472,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021800603717565536,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 27652757.0,
+      "reward": 0.296875,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439120292664,
+      "sampling/importance_sampling_ratio/min": 3.895394547726028e-05,
+      "sampling/sampling_logp_difference/max": 10.153130531311035,
+      "sampling/sampling_logp_difference/mean": 0.019722118973731995,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.9564903318023426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9564903318023426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15754.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 5176.3515625,
+      "completions/mean_terminated_length": 5176.3515625,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 1.0444758981466293,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004153470974415541,
+      "learning_rate": 1e-05,
+      "loss": 0.0798,
+      "num_tokens": 28334386.0,
+      "reward": 0.2734375,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.007421077694743872,
+      "sampling/sampling_logp_difference/max": 4.903430938720703,
+      "sampling/sampling_logp_difference/mean": 0.020159056410193443,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 1.725743459246587e-05,
+      "clip_ratio/high_mean": 4.3143586481164675e-06,
+      "clip_ratio/low_mean": 2.0204584302518924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.451894306432223e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 5178.9921875,
+      "completions/mean_terminated_length": 5001.13525390625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0803537145256996,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002477057045325637,
+      "learning_rate": 1e-05,
+      "loss": 0.0067,
+      "num_tokens": 29017145.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000497102737427,
+      "sampling/importance_sampling_ratio/min": 0.004630985204130411,
+      "sampling/sampling_logp_difference/max": 5.374985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019826076924800873,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 1.6637992303003557e-05,
+      "clip_ratio/high_mean": 4.159498075750889e-06,
+      "clip_ratio/low_mean": 2.1970684144889674e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6130182106953725e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14131.0,
+      "completions/max_terminated_length": 14131.0,
+      "completions/mean_length": 4980.359375,
+      "completions/mean_terminated_length": 4980.359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.9510642662644386,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016275218222290277,
+      "learning_rate": 1e-05,
+      "loss": -0.0097,
+      "num_tokens": 29673535.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750852584839,
+      "sampling/importance_sampling_ratio/min": 0.000599516904912889,
+      "sampling/sampling_logp_difference/max": 7.419386386871338,
+      "sampling/sampling_logp_difference/mean": 0.01844976656138897,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 2.8087193186365766e-05,
+      "clip_ratio/high_mean": 7.021798296591442e-06,
+      "clip_ratio/low_mean": 3.9683913541921356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.670571286169434e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 5778.6953125,
+      "completions/mean_terminated_length": 5695.18896484375,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 1.0413239300251007,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001847646082751453,
+      "learning_rate": 1e-05,
+      "loss": -0.0045,
+      "num_tokens": 30436416.0,
+      "reward": 0.2578125,
+      "reward_std": 0.33903977274894714,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998501539230347,
+      "sampling/importance_sampling_ratio/min": 0.00020348970429040492,
+      "sampling/sampling_logp_difference/max": 8.499895095825195,
+      "sampling/sampling_logp_difference/mean": 0.021502099931240082,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 2.68402091023745e-05,
+      "clip_ratio/high_mean": 8.575278570788214e-06,
+      "clip_ratio/low_mean": 4.547183698377921e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.404711600931478e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14182.0,
+      "completions/max_terminated_length": 14182.0,
+      "completions/mean_length": 4875.125,
+      "completions/mean_terminated_length": 4875.125,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "entropy": 1.0464690178632736,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021134833805263042,
+      "learning_rate": 1e-05,
+      "loss": 0.0727,
+      "num_tokens": 31083672.0,
+      "reward": 0.40625,
+      "reward_std": 0.3584783971309662,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340176582336,
+      "sampling/importance_sampling_ratio/min": 0.012113225646317005,
+      "sampling/sampling_logp_difference/max": 4.41345739364624,
+      "sampling/sampling_logp_difference/mean": 0.019140049815177917,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 3.9877967992651975e-05,
+      "clip_ratio/high_mean": 9.969491998162994e-06,
+      "clip_ratio/low_mean": 3.981287841270387e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9782369273998484e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 4691.421875,
+      "completions/mean_terminated_length": 4505.82568359375,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 1.0229775309562683,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037735572550445795,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 31703654.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2993389964103699,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492168426514,
+      "sampling/importance_sampling_ratio/min": 0.03150063753128052,
+      "sampling/sampling_logp_difference/max": 3.457747459411621,
+      "sampling/sampling_logp_difference/mean": 0.01912039890885353,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 3.5441889849607833e-06,
+      "clip_ratio/high_mean": 8.860472462401958e-07,
+      "clip_ratio/low_mean": 1.5137359810069029e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6023407056309225e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 6821.96875,
+      "completions/mean_terminated_length": 6592.48046875,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 1.1132484003901482,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0010448681423440576,
+      "learning_rate": 1e-05,
+      "loss": 0.022,
+      "num_tokens": 32599778.0,
+      "reward": 0.2265625,
+      "reward_std": 0.1814819872379303,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999915361404419,
+      "sampling/importance_sampling_ratio/min": 0.006500681862235069,
+      "sampling/sampling_logp_difference/max": 5.035848140716553,
+      "sampling/sampling_logp_difference/mean": 0.02125459350645542,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 4.652893949241843e-06,
+      "clip_ratio/high_mean": 1.1632234873104608e-06,
+      "clip_ratio/low_mean": 5.731516603191267e-05,
+      "clip_ratio/low_min": 9.891066838463303e-06,
+      "clip_ratio/region_mean": 5.8478389746596804e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 6834.3671875,
+      "completions/mean_terminated_length": 6605.17626953125,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9827468693256378,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0017670176457613707,
+      "learning_rate": 1e-05,
+      "loss": 0.1105,
+      "num_tokens": 33492737.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3440523147583008,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.0021202093921601772,
+      "sampling/sampling_logp_difference/max": 6.156240463256836,
+      "sampling/sampling_logp_difference/mean": 0.019490526989102364,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 6.717360520269722e-06,
+      "clip_ratio/high_mean": 2.503530367903295e-06,
+      "clip_ratio/low_mean": 2.5672919832686603e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8176450200589898e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14098.0,
+      "completions/mean_length": 6175.296875,
+      "completions/mean_terminated_length": 5845.98388671875,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 1.1584237962961197,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0016891945851966739,
+      "learning_rate": 1e-05,
+      "loss": -0.0008,
+      "num_tokens": 34312455.0,
+      "reward": 0.1875,
+      "reward_std": 0.19673937559127808,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 8.086384332273155e-05,
+      "sampling/sampling_logp_difference/max": 9.422743797302246,
+      "sampling/sampling_logp_difference/mean": 0.021749887615442276,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 2.2362002255249536e-05,
+      "clip_ratio/high_mean": 8.189798336388776e-06,
+      "clip_ratio/low_mean": 2.1058204993096297e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9248002192616696e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16054.0,
+      "completions/mean_length": 6036.8359375,
+      "completions/mean_terminated_length": 5955.3623046875,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.9301538467407227,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003834392176941037,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 35102738.0,
+      "reward": 0.4375,
+      "reward_std": 0.36614155769348145,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998494386672974,
+      "sampling/importance_sampling_ratio/min": 0.00013992394087836146,
+      "sampling/sampling_logp_difference/max": 8.874411582946777,
+      "sampling/sampling_logp_difference/mean": 0.019147861748933792,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1501961580506759e-05,
+      "clip_ratio/high_mean": 2.8754903951266897e-06,
+      "clip_ratio/low_mean": 4.08189714562468e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.369446196506033e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6262.46875,
+      "completions/mean_terminated_length": 5764.68798828125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "entropy": 0.8599015846848488,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029804729856550694,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 35924886.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3911295533180237,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999922513961792,
+      "sampling/importance_sampling_ratio/min": 0.00021375219512265176,
+      "sampling/sampling_logp_difference/max": 9.904524803161621,
+      "sampling/sampling_logp_difference/mean": 0.01815103553235531,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 2.4107544049911667e-05,
+      "clip_ratio/high_mean": 6.026886012477917e-06,
+      "clip_ratio/low_mean": 3.6588148361715866e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.261503391944643e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14556.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 5926.8984375,
+      "completions/mean_terminated_length": 5926.8984375,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "entropy": 1.0042993426322937,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022071697749197483,
+      "learning_rate": 1e-05,
+      "loss": 0.0059,
+      "num_tokens": 36700913.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3306073546409607,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000010371208191,
+      "sampling/importance_sampling_ratio/min": 0.0005220364546403289,
+      "sampling/sampling_logp_difference/max": 7.557773113250732,
+      "sampling/sampling_logp_difference/mean": 0.01954064890742302,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 4.9106265578302555e-06,
+      "clip_ratio/high_mean": 1.2276566394575639e-06,
+      "clip_ratio/low_mean": 2.634599570683349e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7573652346291055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15217.0,
+      "completions/mean_length": 6873.6875,
+      "completions/mean_terminated_length": 6645.4404296875,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 1.0255412608385086,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002320924773812294,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 37604865.0,
+      "reward": 0.234375,
+      "reward_std": 0.3135228157043457,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999098777770996,
+      "sampling/importance_sampling_ratio/min": 0.026153141632676125,
+      "sampling/sampling_logp_difference/max": 3.6437859535217285,
+      "sampling/sampling_logp_difference/mean": 0.019532475620508194,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.6350510122720152e-05,
+      "clip_ratio/high_mean": 4.087627530680038e-06,
+      "clip_ratio/low_mean": 2.351988746340794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7607515221461654e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15668.0,
+      "completions/mean_length": 6073.8984375,
+      "completions/mean_terminated_length": 5992.71630859375,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 1.0713753998279572,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002212709980085492,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 38405196.0,
+      "reward": 0.359375,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998978972434998,
+      "sampling/importance_sampling_ratio/min": 8.706459084351081e-06,
+      "sampling/sampling_logp_difference/max": 11.651445388793945,
+      "sampling/sampling_logp_difference/mean": 0.021252838894724846,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.729486718384578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.729486718384578e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15299.0,
+      "completions/mean_length": 5838.71875,
+      "completions/mean_terminated_length": 5671.33349609375,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "entropy": 1.021155133843422,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001135052996687591,
+      "learning_rate": 1e-05,
+      "loss": 0.0178,
+      "num_tokens": 39171704.0,
+      "reward": 0.28125,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.003084881929680705,
+      "sampling/sampling_logp_difference/max": 5.7812418937683105,
+      "sampling/sampling_logp_difference/mean": 0.020781882107257843,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 1.7124169744420215e-05,
+      "clip_ratio/high_mean": 4.281042436105054e-06,
+      "clip_ratio/low_mean": 3.706903294187214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.135007543482061e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14617.0,
+      "completions/max_terminated_length": 14617.0,
+      "completions/mean_length": 6358.5859375,
+      "completions/mean_terminated_length": 6358.5859375,
+      "completions/min_length": 940.0,
+      "completions/min_terminated_length": 940.0,
+      "entropy": 0.9720487147569656,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002638082252815366,
+      "learning_rate": 1e-05,
+      "loss": 0.0145,
+      "num_tokens": 40003859.0,
+      "reward": 0.40625,
+      "reward_std": 0.3174618184566498,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000380277633667,
+      "sampling/importance_sampling_ratio/min": 0.01960253342986107,
+      "sampling/sampling_logp_difference/max": 3.932096481323242,
+      "sampling/sampling_logp_difference/mean": 0.01991666667163372,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 6.55582925901399e-06,
+      "clip_ratio/high_mean": 2.994117721755174e-06,
+      "clip_ratio/low_mean": 2.222621503733535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5220332759090525e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14753.0,
+      "completions/max_terminated_length": 14753.0,
+      "completions/mean_length": 4634.1875,
+      "completions/mean_terminated_length": 4634.1875,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9715309366583824,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001994960242882371,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 40616483.0,
+      "reward": 0.4375,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000698566436768,
+      "sampling/importance_sampling_ratio/min": 1.0510009815334342e-05,
+      "sampling/sampling_logp_difference/max": 11.46318244934082,
+      "sampling/sampling_logp_difference/mean": 0.01902047172188759,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 2.2474248908110894e-05,
+      "clip_ratio/high_mean": 7.571314540655294e-06,
+      "clip_ratio/low_mean": 4.3583780325207044e-05,
+      "clip_ratio/low_min": 4.6013396968191955e-06,
+      "clip_ratio/region_mean": 5.1155094070054474e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 6596.25,
+      "completions/mean_terminated_length": 6361.34423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 0.8207943215966225,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019902780186384916,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 41484443.0,
+      "reward": 0.4453125,
+      "reward_std": 0.326668381690979,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000016689300537,
+      "sampling/importance_sampling_ratio/min": 7.485233072657138e-05,
+      "sampling/sampling_logp_difference/max": 9.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.018301833420991898,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 3.0019932637515012e-06,
+      "clip_ratio/high_mean": 7.504983159378753e-07,
+      "clip_ratio/low_mean": 4.332785601945943e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.407835376696312e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6785.75,
+      "completions/mean_terminated_length": 6313.70458984375,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "entropy": 0.9876058474183083,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015235114842653275,
+      "learning_rate": 1e-05,
+      "loss": 0.0128,
+      "num_tokens": 42372235.0,
+      "reward": 0.2421875,
+      "reward_std": 0.325075626373291,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999551773071289,
+      "sampling/importance_sampling_ratio/min": 0.026679370552301407,
+      "sampling/sampling_logp_difference/max": 3.6238646507263184,
+      "sampling/sampling_logp_difference/mean": 0.019945615902543068,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1349006601667497e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1349006601667497e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 4881.2109375,
+      "completions/mean_terminated_length": 4510.1533203125,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.989942155778408,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002033712575212121,
+      "learning_rate": 1e-05,
+      "loss": 0.1088,
+      "num_tokens": 43015238.0,
+      "reward": 0.4375,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000300407409668,
+      "sampling/importance_sampling_ratio/min": 0.0001238943514181301,
+      "sampling/sampling_logp_difference/max": 8.996081352233887,
+      "sampling/sampling_logp_difference/mean": 0.01887543685734272,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 2.584004687378183e-05,
+      "clip_ratio/high_mean": 6.4600117184454575e-06,
+      "clip_ratio/low_mean": 2.1371045761497953e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7831058105221018e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15001.0,
+      "completions/max_terminated_length": 15001.0,
+      "completions/mean_length": 4725.3984375,
+      "completions/mean_terminated_length": 4725.3984375,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 1.0350637435913086,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030296226032078266,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 43637737.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999939203262329,
+      "sampling/importance_sampling_ratio/min": 0.00022932067804504186,
+      "sampling/sampling_logp_difference/max": 8.380389213562012,
+      "sampling/sampling_logp_difference/mean": 0.01995944231748581,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 1.994733975152485e-05,
+      "clip_ratio/high_mean": 4.986834937881213e-06,
+      "clip_ratio/low_mean": 3.5168303838872816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.015513832200668e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16240.0,
+      "completions/mean_length": 4918.171875,
+      "completions/mean_terminated_length": 4736.1748046875,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.965274304151535,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002758471528068185,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 44285327.0,
+      "reward": 0.328125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999663233757019,
+      "sampling/importance_sampling_ratio/min": 0.010958661325275898,
+      "sampling/sampling_logp_difference/max": 4.513625144958496,
+      "sampling/sampling_logp_difference/mean": 0.019083233550190926,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 1.0621563887980301e-05,
+      "clip_ratio/high_mean": 2.6553909719950752e-06,
+      "clip_ratio/low_mean": 3.838553107016196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1040922042157035e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15031.0,
+      "completions/mean_length": 4998.2890625,
+      "completions/mean_terminated_length": 4908.6376953125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9200445115566254,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027611786499619484,
+      "learning_rate": 1e-05,
+      "loss": 0.0575,
+      "num_tokens": 44944356.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3895368278026581,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884366989136,
+      "sampling/importance_sampling_ratio/min": 0.0018651526188477874,
+      "sampling/sampling_logp_difference/max": 6.284412384033203,
+      "sampling/sampling_logp_difference/mean": 0.017853498458862305,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.0136624496226432e-05,
+      "clip_ratio/high_mean": 2.534156124056608e-06,
+      "clip_ratio/low_mean": 2.0260404085092887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2794560095462657e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6290.1796875,
+      "completions/mean_terminated_length": 6129.96044921875,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.9360214695334435,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015557854203507304,
+      "learning_rate": 1e-05,
+      "loss": 0.0111,
+      "num_tokens": 45767867.0,
+      "reward": 0.34375,
+      "reward_std": 0.30168038606643677,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999427795410156,
+      "sampling/importance_sampling_ratio/min": 0.0011004531988874078,
+      "sampling/sampling_logp_difference/max": 6.812033176422119,
+      "sampling/sampling_logp_difference/mean": 0.0200855303555727,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 2.2559511307918e-06,
+      "clip_ratio/high_mean": 5.6398778269795e-07,
+      "clip_ratio/low_mean": 4.51761221711422e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.574010984015331e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16366.0,
+      "completions/mean_length": 6486.15625,
+      "completions/mean_terminated_length": 6248.6083984375,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "entropy": 0.863138921558857,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026953541673719883,
+      "learning_rate": 1e-05,
+      "loss": -0.0194,
+      "num_tokens": 46618575.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999406337738037,
+      "sampling/importance_sampling_ratio/min": 0.0011708897072821856,
+      "sampling/sampling_logp_difference/max": 6.749991416931152,
+      "sampling/sampling_logp_difference/mean": 0.01863238587975502,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.0073357771034352e-05,
+      "clip_ratio/high_mean": 2.518339442758588e-06,
+      "clip_ratio/low_mean": 2.787370635815023e-05,
+      "clip_ratio/low_min": 3.837534222839167e-06,
+      "clip_ratio/region_mean": 3.0392045573535142e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 6442.7734375,
+      "completions/mean_terminated_length": 6284.9765625,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0242054909467697,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024442619178444147,
+      "learning_rate": 1e-05,
+      "loss": 0.0569,
+      "num_tokens": 47462274.0,
+      "reward": 0.328125,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998892545700073,
+      "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09,
+      "sampling/sampling_logp_difference/max": 19.124980926513672,
+      "sampling/sampling_logp_difference/mean": 0.019810764119029045,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 1.220810372615233e-05,
+      "clip_ratio/high_mean": 3.0520259315380827e-06,
+      "clip_ratio/low_mean": 4.339240456374682e-05,
+      "clip_ratio/low_min": 4.491233084991109e-06,
+      "clip_ratio/region_mean": 4.644443038159807e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 4807.765625,
+      "completions/mean_terminated_length": 4716.6142578125,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "entropy": 1.045751042664051,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002512057079002261,
+      "learning_rate": 1e-05,
+      "loss": 0.003,
+      "num_tokens": 48096692.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999058842658997,
+      "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05,
+      "sampling/sampling_logp_difference/max": 11.374892234802246,
+      "sampling/sampling_logp_difference/mean": 0.01960371434688568,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 5.37941218681226e-06,
+      "clip_ratio/high_mean": 1.344853046703065e-06,
+      "clip_ratio/low_mean": 3.0161771633174794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1506624850408116e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 6703.8359375,
+      "completions/mean_terminated_length": 6471.51220703125,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 1.0592866837978363,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016389708034694195,
+      "learning_rate": 1e-05,
+      "loss": -0.024,
+      "num_tokens": 48974399.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2585548758506775,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999353885650635,
+      "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06,
+      "sampling/sampling_logp_difference/max": 11.8125,
+      "sampling/sampling_logp_difference/mean": 0.020880095660686493,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 7.093600515872822e-06,
+      "clip_ratio/high_mean": 1.7734001289682055e-06,
+      "clip_ratio/low_mean": 4.470584758564655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.647924811251869e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16295.0,
+      "completions/mean_length": 6140.5078125,
+      "completions/mean_terminated_length": 5724.10546875,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 1.0998501181602478,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003946912474930286,
+      "learning_rate": 1e-05,
+      "loss": 0.0448,
+      "num_tokens": 49779920.0,
+      "reward": 0.34375,
+      "reward_std": 0.36796674132347107,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 2.849436668839189e-07,
+      "sampling/sampling_logp_difference/max": 15.070974349975586,
+      "sampling/sampling_logp_difference/mean": 0.021355850622057915,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.313956779038563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.313956779038563e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 6689.8046875,
+      "completions/mean_terminated_length": 6213.04052734375,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.8561654165387154,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021656695753335953,
+      "learning_rate": 1e-05,
+      "loss": 0.0283,
+      "num_tokens": 50655023.0,
+      "reward": 0.203125,
+      "reward_std": 0.21723884344100952,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999941885471344,
+      "sampling/importance_sampling_ratio/min": 2.836359499269747e-06,
+      "sampling/sampling_logp_difference/max": 12.772989273071289,
+      "sampling/sampling_logp_difference/mean": 0.01873670145869255,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 2.3421607693308033e-05,
+      "clip_ratio/high_mean": 7.242933975248889e-06,
+      "clip_ratio/low_mean": 3.896083626386826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.620377103492501e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14330.0,
+      "completions/max_terminated_length": 14330.0,
+      "completions/mean_length": 5707.0078125,
+      "completions/mean_terminated_length": 5707.0078125,
+      "completions/min_length": 625.0,
+      "completions/min_terminated_length": 625.0,
+      "entropy": 1.1396166533231735,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004121148493140936,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 51406536.0,
+      "reward": 0.3125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999328851699829,
+      "sampling/importance_sampling_ratio/min": 0.0005196487763896585,
+      "sampling/sampling_logp_difference/max": 7.562357425689697,
+      "sampling/sampling_logp_difference/mean": 0.020000409334897995,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 1.82290532393381e-05,
+      "clip_ratio/high_mean": 4.557263309834525e-06,
+      "clip_ratio/low_mean": 2.5275351731579576e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9832615496161452e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5655.6328125,
+      "completions/mean_terminated_length": 5571.1572265625,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "entropy": 0.8928132206201553,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032538517843931913,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 52148473.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29432642459869385,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000033378601074,
+      "sampling/importance_sampling_ratio/min": 0.0017573959194123745,
+      "sampling/sampling_logp_difference/max": 6.343922138214111,
+      "sampling/sampling_logp_difference/mean": 0.018881790339946747,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.2836022506235167e-05,
+      "clip_ratio/high_mean": 3.209005626558792e-06,
+      "clip_ratio/low_mean": 3.8109637216621195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.131864307055366e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7399.7890625,
+      "completions/mean_terminated_length": 7034.5771484375,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "entropy": 0.8808257132768631,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002061733277514577,
+      "learning_rate": 1e-05,
+      "loss": 0.0191,
+      "num_tokens": 53113230.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673962593079,
+      "sampling/importance_sampling_ratio/min": 0.005283349193632603,
+      "sampling/sampling_logp_difference/max": 5.243195056915283,
+      "sampling/sampling_logp_difference/mean": 0.018456293269991875,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 1.5806871488166507e-05,
+      "clip_ratio/high_mean": 4.739466817227367e-06,
+      "clip_ratio/low_mean": 3.610486896832299e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.084433521711617e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16208.0,
+      "completions/mean_length": 5730.9609375,
+      "completions/mean_terminated_length": 5475.2880859375,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9486126750707626,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012298432411625981,
+      "learning_rate": 1e-05,
+      "loss": 0.0208,
+      "num_tokens": 53864049.0,
+      "reward": 0.359375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999348521232605,
+      "sampling/importance_sampling_ratio/min": 4.832820559386164e-05,
+      "sampling/sampling_logp_difference/max": 9.937495231628418,
+      "sampling/sampling_logp_difference/mean": 0.01919996738433838,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.2390134997986024e-05,
+      "clip_ratio/high_mean": 3.097533749496506e-06,
+      "clip_ratio/low_mean": 3.8867822581778455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.19653564449618e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13500.0,
+      "completions/mean_length": 4620.5703125,
+      "completions/mean_terminated_length": 4527.94482421875,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9557560831308365,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002882040338590741,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 54473498.0,
+      "reward": 0.3984375,
+      "reward_std": 0.39294686913490295,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998915195465088,
+      "sampling/importance_sampling_ratio/min": 1.577107298089686e-07,
+      "sampling/sampling_logp_difference/max": 15.662503242492676,
+      "sampling/sampling_logp_difference/mean": 0.018525000661611557,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.088819471486204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.088819471486204e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16314.0,
+      "completions/max_terminated_length": 16314.0,
+      "completions/mean_length": 5074.0703125,
+      "completions/mean_terminated_length": 5074.0703125,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8830869868397713,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003324020653963089,
+      "learning_rate": 1e-05,
+      "loss": 0.0305,
+      "num_tokens": 55141787.0,
+      "reward": 0.4609375,
+      "reward_std": 0.30115634202957153,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999203681945801,
+      "sampling/importance_sampling_ratio/min": 0.0009876838885247707,
+      "sampling/sampling_logp_difference/max": 6.920147895812988,
+      "sampling/sampling_logp_difference/mean": 0.018072880804538727,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.526649884908693e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.526649884908693e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15251.0,
+      "completions/max_terminated_length": 15251.0,
+      "completions/mean_length": 6192.1015625,
+      "completions/mean_terminated_length": 6192.1015625,
+      "completions/min_length": 553.0,
+      "completions/min_terminated_length": 553.0,
+      "entropy": 1.0888547226786613,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017452294705435634,
+      "learning_rate": 1e-05,
+      "loss": 0.0216,
+      "num_tokens": 55954144.0,
+      "reward": 0.2890625,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473690986633,
+      "sampling/importance_sampling_ratio/min": 5.061922365712235e-07,
+      "sampling/sampling_logp_difference/max": 14.496349334716797,
+      "sampling/sampling_logp_difference/mean": 0.021221645176410675,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.6768677141953958e-05,
+      "clip_ratio/high_mean": 5.080836899651331e-06,
+      "clip_ratio/low_mean": 3.340929970363504e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.84901372854074e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6204.296875,
+      "completions/mean_terminated_length": 6124.1416015625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 1.0423575639724731,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0033357341308146715,
+      "learning_rate": 1e-05,
+      "loss": 0.1073,
+      "num_tokens": 56765470.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37875816226005554,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99998539686203,
+      "sampling/importance_sampling_ratio/min": 4.564182381727733e-05,
+      "sampling/sampling_logp_difference/max": 9.994686126708984,
+      "sampling/sampling_logp_difference/mean": 0.01908688060939312,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 3.149884150843718e-06,
+      "clip_ratio/high_mean": 7.874710377109295e-07,
+      "clip_ratio/low_mean": 2.430614893000893e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.509361991087644e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14409.0,
+      "completions/max_terminated_length": 14409.0,
+      "completions/mean_length": 5070.3125,
+      "completions/mean_terminated_length": 5070.3125,
+      "completions/min_length": 629.0,
+      "completions/min_terminated_length": 629.0,
+      "entropy": 1.0737399458885193,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038695367984473705,
+      "learning_rate": 1e-05,
+      "loss": 0.0015,
+      "num_tokens": 57432958.0,
+      "reward": 0.390625,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223947525024,
+      "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06,
+      "sampling/sampling_logp_difference/max": 13.376652717590332,
+      "sampling/sampling_logp_difference/mean": 0.01970684342086315,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 1.9821940441033803e-05,
+      "clip_ratio/high_mean": 4.955485110258451e-06,
+      "clip_ratio/low_mean": 2.9055729555693688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.401121466595214e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15799.0,
+      "completions/mean_length": 5750.21875,
+      "completions/mean_terminated_length": 5495.00830078125,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "entropy": 0.9708107560873032,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002927646040916443,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 58187426.0,
+      "reward": 0.296875,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999390840530396,
+      "sampling/importance_sampling_ratio/min": 0.015204614959657192,
+      "sampling/sampling_logp_difference/max": 4.186156272888184,
+      "sampling/sampling_logp_difference/mean": 0.019483914598822594,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.3815636723156786e-05,
+      "clip_ratio/high_mean": 5.953909180789196e-06,
+      "clip_ratio/low_mean": 4.989707144886779e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.585097960647545e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15938.0,
+      "completions/mean_length": 6067.484375,
+      "completions/mean_terminated_length": 5986.251953125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9576351121068001,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0026169484481215477,
+      "learning_rate": 1e-05,
+      "loss": -0.0055,
+      "num_tokens": 58983336.0,
+      "reward": 0.390625,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999620914459229,
+      "sampling/importance_sampling_ratio/min": 1.974713995878119e-06,
+      "sampling/sampling_logp_difference/max": 13.135087013244629,
+      "sampling/sampling_logp_difference/mean": 0.019007554277777672,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 2.4238934656750644e-05,
+      "clip_ratio/high_mean": 7.786730066072778e-06,
+      "clip_ratio/low_mean": 4.5700241571466904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3486972547034384e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13640.0,
+      "completions/max_terminated_length": 13640.0,
+      "completions/mean_length": 4612.8984375,
+      "completions/mean_terminated_length": 4612.8984375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "entropy": 0.9636320173740387,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015429699560627341,
+      "learning_rate": 1e-05,
+      "loss": -0.018,
+      "num_tokens": 59590763.0,
+      "reward": 0.421875,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08,
+      "sampling/sampling_logp_difference/max": 17.468652725219727,
+      "sampling/sampling_logp_difference/mean": 0.019313856959342957,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0911465842109465e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0911465842109465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 6101.3125,
+      "completions/mean_terminated_length": 5854.5283203125,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "entropy": 0.8831139355897903,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022505265660583973,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 60391283.0,
+      "reward": 0.3125,
+      "reward_std": 0.29302334785461426,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 0.0003816343960352242,
+      "sampling/sampling_logp_difference/max": 7.871047496795654,
+      "sampling/sampling_logp_difference/mean": 0.018377842381596565,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 1.547606643725885e-05,
+      "clip_ratio/high_mean": 3.869016609314713e-06,
+      "clip_ratio/low_mean": 2.478705800967873e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8656074391619768e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14862.0,
+      "completions/mean_length": 4705.9921875,
+      "completions/mean_terminated_length": 4614.03955078125,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "entropy": 0.9557913094758987,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002069958718493581,
+      "learning_rate": 1e-05,
+      "loss": -0.0015,
+      "num_tokens": 61021490.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2637920379638672,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030232429504,
+      "sampling/importance_sampling_ratio/min": 2.76673017651774e-05,
+      "sampling/sampling_logp_difference/max": 10.495259284973145,
+      "sampling/sampling_logp_difference/mean": 0.018629569560289383,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 2.0910484636260662e-05,
+      "clip_ratio/high_mean": 5.2276211590651656e-06,
+      "clip_ratio/low_mean": 1.952954164607945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4757162805144617e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13745.0,
+      "completions/max_terminated_length": 13745.0,
+      "completions/mean_length": 5116.78125,
+      "completions/mean_terminated_length": 5116.78125,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "entropy": 1.0198405236005783,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034461067989468575,
+      "learning_rate": 1e-05,
+      "loss": -0.0073,
+      "num_tokens": 61695382.0,
+      "reward": 0.265625,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999936819076538,
+      "sampling/importance_sampling_ratio/min": 0.012227212078869343,
+      "sampling/sampling_logp_difference/max": 4.4040913581848145,
+      "sampling/sampling_logp_difference/mean": 0.019400250166654587,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.5340228401328204e-05,
+      "clip_ratio/high_mean": 3.835057100332051e-06,
+      "clip_ratio/low_mean": 3.150914017169271e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.534419727202476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 5891.9140625,
+      "completions/mean_terminated_length": 5553.45947265625,
+      "completions/min_length": 79.0,
+      "completions/min_terminated_length": 79.0,
+      "entropy": 0.9568078517913818,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025854657869786024,
+      "learning_rate": 1e-05,
+      "loss": 0.1013,
+      "num_tokens": 62474883.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001013278961182,
+      "sampling/importance_sampling_ratio/min": 0.0015072470996528864,
+      "sampling/sampling_logp_difference/max": 6.497470378875732,
+      "sampling/sampling_logp_difference/mean": 0.019574139267206192,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 1.108303422370227e-05,
+      "clip_ratio/high_mean": 2.7707585559255676e-06,
+      "clip_ratio/low_mean": 2.2325777763398946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5096536319324514e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13671.0,
+      "completions/mean_length": 5300.3359375,
+      "completions/mean_terminated_length": 5213.06298828125,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "entropy": 0.9722280204296112,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025075653102248907,
+      "learning_rate": 1e-05,
+      "loss": 0.0312,
+      "num_tokens": 63172454.0,
+      "reward": 0.203125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 0.00020346972451079637,
+      "sampling/sampling_logp_difference/max": 8.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02002432942390442,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 1.3991947980684927e-05,
+      "clip_ratio/high_mean": 3.4979869951712317e-06,
+      "clip_ratio/low_mean": 4.893367201930232e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.243165958290774e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15617.0,
+      "completions/mean_length": 6364.21875,
+      "completions/mean_terminated_length": 6205.1748046875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "entropy": 1.0607495978474617,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017982006538659334,
+      "learning_rate": 1e-05,
+      "loss": -0.0117,
+      "num_tokens": 64007602.0,
+      "reward": 0.2890625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 3.823801307589747e-05,
+      "sampling/sampling_logp_difference/max": 10.171680450439453,
+      "sampling/sampling_logp_difference/mean": 0.020373597741127014,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6416430046083406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6416430046083406e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14709.0,
+      "completions/mean_length": 5746.3125,
+      "completions/mean_terminated_length": 5403.1611328125,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "entropy": 0.9913106113672256,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002207317156717181,
+      "learning_rate": 1e-05,
+      "loss": 0.063,
+      "num_tokens": 64762058.0,
+      "reward": 0.34375,
+      "reward_std": 0.3264310359954834,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999239444732666,
+      "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08,
+      "sampling/sampling_logp_difference/max": 16.744617462158203,
+      "sampling/sampling_logp_difference/mean": 0.020608089864253998,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 1.2681661701208213e-05,
+      "clip_ratio/high_mean": 3.1704154253020533e-06,
+      "clip_ratio/low_mean": 3.541917828897567e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.85895939416514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6088.5625,
+      "completions/mean_terminated_length": 5841.47216796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.9040444120764732,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012974507408216596,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 65561002.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998487234115601,
+      "sampling/importance_sampling_ratio/min": 6.021501121722395e-06,
+      "sampling/sampling_logp_difference/max": 12.020174026489258,
+      "sampling/sampling_logp_difference/mean": 0.01939838007092476,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 7.807132533343975e-06,
+      "clip_ratio/high_mean": 1.9517831333359936e-06,
+      "clip_ratio/low_mean": 1.8564539345788944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.05163223654381e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15021.0,
+      "completions/mean_length": 5765.5,
+      "completions/mean_terminated_length": 5510.65625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 0.9966336265206337,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0013380619930103421,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 66318482.0,
+      "reward": 0.375,
+      "reward_std": 0.13994136452674866,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999471306800842,
+      "sampling/importance_sampling_ratio/min": 7.288413598871557e-06,
+      "sampling/sampling_logp_difference/max": 11.829224586486816,
+      "sampling/sampling_logp_difference/mean": 0.018109245225787163,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 1.7906912489706883e-05,
+      "clip_ratio/high_mean": 4.476728122426721e-06,
+      "clip_ratio/low_mean": 2.5812531305291486e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0289259655091882e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16120.0,
+      "completions/mean_length": 5462.78125,
+      "completions/mean_terminated_length": 5200.67236328125,
+      "completions/min_length": 460.0,
+      "completions/min_terminated_length": 460.0,
+      "entropy": 0.9345141425728798,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023930128663778305,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 67038582.0,
+      "reward": 0.46875,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513030052185,
+      "sampling/importance_sampling_ratio/min": 0.008508839644491673,
+      "sampling/sampling_logp_difference/max": 4.7666497230529785,
+      "sampling/sampling_logp_difference/mean": 0.019220296293497086,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.551389118503721e-05,
+      "clip_ratio/high_mean": 3.878472796259302e-06,
+      "clip_ratio/low_mean": 3.239646628117043e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6274939645863924e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15034.0,
+      "completions/max_terminated_length": 15034.0,
+      "completions/mean_length": 5547.5078125,
+      "completions/mean_terminated_length": 5547.5078125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0511749312281609,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0013633714988827705,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 67774487.0,
+      "reward": 0.203125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05,
+      "sampling/sampling_logp_difference/max": 11.418023109436035,
+      "sampling/sampling_logp_difference/mean": 0.020328814163804054,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.5384989410449634e-05,
+      "clip_ratio/high_mean": 3.846247352612409e-06,
+      "clip_ratio/low_mean": 3.441604167164769e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.826228908110352e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5835.4140625,
+      "completions/mean_terminated_length": 5406.609375,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 1.0024723336100578,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0036165034398436546,
+      "learning_rate": 1e-05,
+      "loss": 0.0373,
+      "num_tokens": 68541660.0,
+      "reward": 0.34375,
+      "reward_std": 0.3584783673286438,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 9.518130354990717e-06,
+      "sampling/sampling_logp_difference/max": 11.562312126159668,
+      "sampling/sampling_logp_difference/mean": 0.020469525828957558,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 6.105602551542688e-06,
+      "clip_ratio/high_mean": 1.526400637885672e-06,
+      "clip_ratio/low_mean": 5.3129634352444555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.46560352177039e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15695.0,
+      "completions/mean_length": 6252.609375,
+      "completions/mean_terminated_length": 6172.83447265625,
+      "completions/min_length": 481.0,
+      "completions/min_terminated_length": 481.0,
+      "entropy": 1.0325519517064095,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022011541295796633,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 69365418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.32301604747772217,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998809099197388,
+      "sampling/importance_sampling_ratio/min": 0.0005531083443202078,
+      "sampling/sampling_logp_difference/max": 7.4999566078186035,
+      "sampling/sampling_logp_difference/mean": 0.02079072594642639,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 4.348128641140647e-06,
+      "clip_ratio/high_mean": 1.0870321602851618e-06,
+      "clip_ratio/low_mean": 3.0097819148977578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.118485085451539e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15316.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 5581.484375,
+      "completions/mean_terminated_length": 5581.484375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.9222500994801521,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002300912281498313,
+      "learning_rate": 1e-05,
+      "loss": -0.0007,
+      "num_tokens": 70099320.0,
+      "reward": 0.296875,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998577833175659,
+      "sampling/importance_sampling_ratio/min": 8.140386853483506e-08,
+      "sampling/sampling_logp_difference/max": 16.323843002319336,
+      "sampling/sampling_logp_difference/mean": 0.01952272653579712,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.5122252029395895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5122252029395895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15781.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5424.140625,
+      "completions/mean_terminated_length": 5424.140625,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "entropy": 1.0446564108133316,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016312639927491546,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 70811474.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000094175338745,
+      "sampling/importance_sampling_ratio/min": 0.0021919538266956806,
+      "sampling/sampling_logp_difference/max": 6.12296199798584,
+      "sampling/sampling_logp_difference/mean": 0.019741754978895187,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.0354576261306647e-05,
+      "clip_ratio/high_mean": 3.496124691082514e-06,
+      "clip_ratio/low_mean": 4.096481598026003e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.446094089871622e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 5884.9609375,
+      "completions/mean_terminated_length": 5884.9609375,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9605691060423851,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032865386456251144,
+      "learning_rate": 1e-05,
+      "loss": 0.0451,
+      "num_tokens": 71582701.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3514111638069153,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999833106994629,
+      "sampling/importance_sampling_ratio/min": 1.149311810877407e-05,
+      "sampling/sampling_logp_difference/max": 11.373762130737305,
+      "sampling/sampling_logp_difference/mean": 0.019438734278082848,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 1.026998006636859e-05,
+      "clip_ratio/high_mean": 2.5674950165921473e-06,
+      "clip_ratio/low_mean": 3.5440503552308655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8007998455213965e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15361.0,
+      "completions/max_terminated_length": 15361.0,
+      "completions/mean_length": 4835.09375,
+      "completions/mean_terminated_length": 4835.09375,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.9038172215223312,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004721678793430328,
+      "learning_rate": 1e-05,
+      "loss": 0.1143,
+      "num_tokens": 72220025.0,
+      "reward": 0.4765625,
+      "reward_std": 0.38481879234313965,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99994957447052,
+      "sampling/importance_sampling_ratio/min": 2.710051205667696e-07,
+      "sampling/sampling_logp_difference/max": 15.12112808227539,
+      "sampling/sampling_logp_difference/mean": 0.017888439819216728,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 2.93432283342554e-05,
+      "clip_ratio/high_mean": 9.56252398509605e-06,
+      "clip_ratio/low_mean": 4.7865792453194445e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.742831808674964e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14431.0,
+      "completions/mean_length": 5979.078125,
+      "completions/mean_terminated_length": 5897.1494140625,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "entropy": 1.0227951630949974,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0010532280430197716,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 73005515.0,
+      "reward": 0.2890625,
+      "reward_std": 0.30115631222724915,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999090433120728,
+      "sampling/importance_sampling_ratio/min": 0.00030157779110595584,
+      "sampling/sampling_logp_difference/max": 8.10648250579834,
+      "sampling/sampling_logp_difference/mean": 0.019633149728178978,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 4.203234766464448e-06,
+      "clip_ratio/high_mean": 1.050808691616112e-06,
+      "clip_ratio/low_mean": 2.5574990331733716e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6625799137036665e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15886.0,
+      "completions/max_terminated_length": 15886.0,
+      "completions/mean_length": 4292.1796875,
+      "completions/mean_terminated_length": 4292.1796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.8719984591007233,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038324075285345316,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 73572794.0,
+      "reward": 0.4375,
+      "reward_std": 0.2972046136856079,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999188780784607,
+      "sampling/importance_sampling_ratio/min": 0.015675775706768036,
+      "sampling/sampling_logp_difference/max": 4.155638694763184,
+      "sampling/sampling_logp_difference/mean": 0.018074234947562218,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 4.431366960488958e-06,
+      "clip_ratio/high_mean": 1.1078417401222396e-06,
+      "clip_ratio/low_mean": 4.433405501913512e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.54418968729442e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14674.0,
+      "completions/max_terminated_length": 14674.0,
+      "completions/mean_length": 5449.2890625,
+      "completions/mean_terminated_length": 5449.2890625,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9137986451387405,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004843447357416153,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 74289607.0,
+      "reward": 0.5,
+      "reward_std": 0.40609243512153625,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 8.851584993863071e-07,
+      "sampling/sampling_logp_difference/max": 13.937499046325684,
+      "sampling/sampling_logp_difference/mean": 0.018183842301368713,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 8.212076863856055e-06,
+      "clip_ratio/high_mean": 2.0530192159640137e-06,
+      "clip_ratio/low_mean": 3.6279372466196946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.833239122741361e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16163.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 4983.3515625,
+      "completions/mean_terminated_length": 4983.3515625,
+      "completions/min_length": 541.0,
+      "completions/min_terminated_length": 541.0,
+      "entropy": 0.9354705810546875,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037651765160262585,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 74946484.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519309043884,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549984931946,
+      "sampling/importance_sampling_ratio/min": 0.00011593531962716952,
+      "sampling/sampling_logp_difference/max": 9.062478065490723,
+      "sampling/sampling_logp_difference/mean": 0.018207306042313576,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.3182888324081432e-05,
+      "clip_ratio/high_mean": 3.295722081020358e-06,
+      "clip_ratio/low_mean": 2.544108633628639e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8736808644680423e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16039.0,
+      "completions/mean_length": 6351.1015625,
+      "completions/mean_terminated_length": 6027.45947265625,
+      "completions/min_length": 804.0,
+      "completions/min_terminated_length": 804.0,
+      "entropy": 0.9310042560100555,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0009160125628113747,
+      "learning_rate": 1e-05,
+      "loss": -0.023,
+      "num_tokens": 75779145.0,
+      "reward": 0.3828125,
+      "reward_std": 0.24329257011413574,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998877048492432,
+      "sampling/importance_sampling_ratio/min": 0.0002961359277833253,
+      "sampling/sampling_logp_difference/max": 8.1246919631958,
+      "sampling/sampling_logp_difference/mean": 0.018513178452849388,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.1402620202716207e-05,
+      "clip_ratio/high_mean": 3.935649147024378e-06,
+      "clip_ratio/low_mean": 3.059757568735222e-05,
+      "clip_ratio/low_min": 4.3258582991256844e-06,
+      "clip_ratio/region_mean": 3.45332257438713e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14471.0,
+      "completions/mean_length": 5293.40625,
+      "completions/mean_terminated_length": 4935.64501953125,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "entropy": 1.0732879787683487,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023993055801838636,
+      "learning_rate": 1e-05,
+      "loss": 0.1021,
+      "num_tokens": 76475557.0,
+      "reward": 0.34375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000077724456787,
+      "sampling/importance_sampling_ratio/min": 6.613240111619234e-05,
+      "sampling/sampling_logp_difference/max": 9.623851776123047,
+      "sampling/sampling_logp_difference/mean": 0.020792219787836075,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 2.130644793396641e-05,
+      "clip_ratio/high_mean": 8.929533635182452e-06,
+      "clip_ratio/low_mean": 2.663600798769039e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.556554071337814e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 7619.7578125,
+      "completions/mean_terminated_length": 7409.41650390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9646238535642624,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014872358879074454,
+      "learning_rate": 1e-05,
+      "loss": 0.0439,
+      "num_tokens": 77474310.0,
+      "reward": 0.34375,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999638795852661,
+      "sampling/importance_sampling_ratio/min": 0.0016686831368133426,
+      "sampling/sampling_logp_difference/max": 6.395720481872559,
+      "sampling/sampling_logp_difference/mean": 0.020074717700481415,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 1.7765815300663235e-05,
+      "clip_ratio/high_mean": 5.154013138053415e-06,
+      "clip_ratio/low_mean": 5.166909659237717e-05,
+      "clip_ratio/low_min": 8.365680514543783e-06,
+      "clip_ratio/region_mean": 5.68231100714911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15984.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 5959.921875,
+      "completions/mean_terminated_length": 5959.921875,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.004471093416214,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00398358516395092,
+      "learning_rate": 1e-05,
+      "loss": 0.1016,
+      "num_tokens": 78257132.0,
+      "reward": 0.359375,
+      "reward_std": 0.3653082847595215,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000170469284058,
+      "sampling/importance_sampling_ratio/min": 0.0030075267422944307,
+      "sampling/sampling_logp_difference/max": 5.806637287139893,
+      "sampling/sampling_logp_difference/mean": 0.020755283534526825,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6946955838648137e-05,
+      "clip_ratio/high_mean": 4.236738959662034e-06,
+      "clip_ratio/low_mean": 4.510891039899434e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.934564867653535e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13736.0,
+      "completions/mean_length": 5427.03125,
+      "completions/mean_terminated_length": 5340.755859375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9117375314235687,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0019883522763848305,
+      "learning_rate": 1e-05,
+      "loss": 0.01,
+      "num_tokens": 78971072.0,
+      "reward": 0.375,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000550746917725,
+      "sampling/importance_sampling_ratio/min": 0.0008046010043472052,
+      "sampling/sampling_logp_difference/max": 7.125164031982422,
+      "sampling/sampling_logp_difference/mean": 0.018812140449881554,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 2.968176841022796e-05,
+      "clip_ratio/high_mean": 7.42044210255699e-06,
+      "clip_ratio/low_mean": 3.220799408154562e-05,
+      "clip_ratio/low_min": 5.315981979947537e-06,
+      "clip_ratio/region_mean": 3.962843629778945e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16293.0,
+      "completions/max_terminated_length": 16293.0,
+      "completions/mean_length": 6062.078125,
+      "completions/mean_terminated_length": 6062.078125,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 1.0164100378751755,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00450351694598794,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 79764434.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26355957984924316,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999713897705078,
+      "sampling/importance_sampling_ratio/min": 0.0007411236292682588,
+      "sampling/sampling_logp_difference/max": 7.207343101501465,
+      "sampling/sampling_logp_difference/mean": 0.020526543259620667,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.856050622947805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.856050622947805e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13689.0,
+      "completions/max_terminated_length": 13689.0,
+      "completions/mean_length": 4856.53125,
+      "completions/mean_terminated_length": 4856.53125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "entropy": 1.0780886858701706,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0033157530706375837,
+      "learning_rate": 1e-05,
+      "loss": 0.046,
+      "num_tokens": 80405238.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3487703502178192,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999889135360718,
+      "sampling/importance_sampling_ratio/min": 0.033773623406887054,
+      "sampling/sampling_logp_difference/max": 3.7256407737731934,
+      "sampling/sampling_logp_difference/mean": 0.019188418984413147,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.975351790406421e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.975351790406421e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16335.0,
+      "completions/max_terminated_length": 16335.0,
+      "completions/mean_length": 3930.5859375,
+      "completions/mean_terminated_length": 3930.5859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8666863515973091,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005471619311720133,
+      "learning_rate": 1e-05,
+      "loss": -0.0779,
+      "num_tokens": 80926721.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3164186179637909,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000040531158447,
+      "sampling/importance_sampling_ratio/min": 0.0002562212466727942,
+      "sampling/sampling_logp_difference/max": 8.269469261169434,
+      "sampling/sampling_logp_difference/mean": 0.017708823084831238,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 6.743997801095247e-06,
+      "clip_ratio/high_mean": 1.6859994502738118e-06,
+      "clip_ratio/low_mean": 3.61007656692891e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7786765119562915e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15546.0,
+      "completions/mean_length": 5934.9453125,
+      "completions/mean_terminated_length": 5684.16845703125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 0.9991667941212654,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002580739092081785,
+      "learning_rate": 1e-05,
+      "loss": -0.0065,
+      "num_tokens": 81707978.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24671243131160736,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000852346420288,
+      "sampling/importance_sampling_ratio/min": 0.002478762762621045,
+      "sampling/sampling_logp_difference/max": 5.999995708465576,
+      "sampling/sampling_logp_difference/mean": 0.019801246002316475,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.43532002741631e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.43532002741631e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 5866.84375,
+      "completions/mean_terminated_length": 5699.9052734375,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "entropy": 0.9848997294902802,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0010949905263260007,
+      "learning_rate": 1e-05,
+      "loss": 0.0266,
+      "num_tokens": 82477310.0,
+      "reward": 0.2734375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999667406082153,
+      "sampling/importance_sampling_ratio/min": 9.04304688447155e-05,
+      "sampling/sampling_logp_difference/max": 9.310929298400879,
+      "sampling/sampling_logp_difference/mean": 0.020769795402884483,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 1.9307613456476247e-05,
+      "clip_ratio/high_mean": 4.826903364119062e-06,
+      "clip_ratio/low_mean": 5.842190330440644e-05,
+      "clip_ratio/low_min": 1.2287753634154797e-05,
+      "clip_ratio/region_mean": 6.324880496322294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14501.0,
+      "completions/max_terminated_length": 14501.0,
+      "completions/mean_length": 6613.7578125,
+      "completions/mean_terminated_length": 6613.7578125,
+      "completions/min_length": 1033.0,
+      "completions/min_terminated_length": 1033.0,
+      "entropy": 0.9176012054085732,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020384234376251698,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 83345055.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999457001686096,
+      "sampling/importance_sampling_ratio/min": 0.029541675001382828,
+      "sampling/sampling_logp_difference/max": 3.5219533443450928,
+      "sampling/sampling_logp_difference/mean": 0.018883168697357178,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.382043183184578e-05,
+      "clip_ratio/high_mean": 3.455107957961445e-06,
+      "clip_ratio/low_mean": 5.789885449303256e-05,
+      "clip_ratio/low_min": 1.017130716718384e-05,
+      "clip_ratio/region_mean": 6.135396188255982e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 6392.3125,
+      "completions/mean_terminated_length": 6070.0,
+      "completions/min_length": 507.0,
+      "completions/min_terminated_length": 507.0,
+      "entropy": 0.904954232275486,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0031166900880634785,
+      "learning_rate": 1e-05,
+      "loss": 0.0351,
+      "num_tokens": 84186343.0,
+      "reward": 0.390625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999208450317383,
+      "sampling/importance_sampling_ratio/min": 0.00022529886336997151,
+      "sampling/sampling_logp_difference/max": 8.398082733154297,
+      "sampling/sampling_logp_difference/mean": 0.01931958645582199,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.7221671441802755e-05,
+      "clip_ratio/high_mean": 6.549099907715572e-06,
+      "clip_ratio/low_mean": 3.147818074467068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802728065238625e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5982.703125,
+      "completions/mean_terminated_length": 5817.603515625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 0.8394555225968361,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022041688207536936,
+      "learning_rate": 1e-05,
+      "loss": 0.1043,
+      "num_tokens": 84971129.0,
+      "reward": 0.3125,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030828475952,
+      "sampling/importance_sampling_ratio/min": 1.553593506287143e-06,
+      "sampling/sampling_logp_difference/max": 13.374939918518066,
+      "sampling/sampling_logp_difference/mean": 0.01795877143740654,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 2.9651660042873118e-05,
+      "clip_ratio/high_mean": 9.398806923854863e-06,
+      "clip_ratio/low_mean": 4.788733849636628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.728614519284747e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14988.0,
+      "completions/mean_length": 4976.921875,
+      "completions/mean_terminated_length": 4608.95166015625,
+      "completions/min_length": 335.0,
+      "completions/min_terminated_length": 335.0,
+      "entropy": 0.8381234556436539,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0037972736172378063,
+      "learning_rate": 1e-05,
+      "loss": 0.1244,
+      "num_tokens": 85625559.0,
+      "reward": 0.4765625,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970555305481,
+      "sampling/importance_sampling_ratio/min": 0.002990707289427519,
+      "sampling/sampling_logp_difference/max": 5.8122453689575195,
+      "sampling/sampling_logp_difference/mean": 0.01815030723810196,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 4.130592969886493e-06,
+      "clip_ratio/high_mean": 1.0326482424716232e-06,
+      "clip_ratio/low_mean": 1.6904315600640984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7936963843112608e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 6307.2421875,
+      "completions/mean_terminated_length": 6065.400390625,
+      "completions/min_length": 823.0,
+      "completions/min_terminated_length": 823.0,
+      "entropy": 1.1176434755325317,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0012413962977007031,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 86453606.0,
+      "reward": 0.28125,
+      "reward_std": 0.2280253767967224,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 0.004730688873678446,
+      "sampling/sampling_logp_difference/max": 5.353684425354004,
+      "sampling/sampling_logp_difference/mean": 0.021790307015180588,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.3160772823539446e-05,
+      "clip_ratio/high_mean": 3.2901932058848615e-06,
+      "clip_ratio/low_mean": 3.582628983167524e-05,
+      "clip_ratio/low_min": 2.61966624748311e-06,
+      "clip_ratio/region_mean": 3.911648195753514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 7263.1640625,
+      "completions/mean_terminated_length": 7044.26416015625,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.107876107096672,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017762042116373777,
+      "learning_rate": 1e-05,
+      "loss": 0.0349,
+      "num_tokens": 87402763.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741315841675,
+      "sampling/importance_sampling_ratio/min": 0.0009408573969267309,
+      "sampling/sampling_logp_difference/max": 6.968719005584717,
+      "sampling/sampling_logp_difference/mean": 0.02103034406900406,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.987745776612428e-05,
+      "clip_ratio/high_mean": 1.1877163728968299e-05,
+      "clip_ratio/low_mean": 4.26799579145154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.455712096136267e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15416.0,
+      "completions/mean_length": 5093.859375,
+      "completions/mean_terminated_length": 4914.65087890625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 1.1065888702869415,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032127038575708866,
+      "learning_rate": 1e-05,
+      "loss": 0.0194,
+      "num_tokens": 88077385.0,
+      "reward": 0.421875,
+      "reward_std": 0.345874547958374,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 7.033879228401929e-05,
+      "sampling/sampling_logp_difference/max": 9.562187194824219,
+      "sampling/sampling_logp_difference/mean": 0.020314980298280716,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 9.35208754526684e-06,
+      "clip_ratio/high_mean": 4.4788730519940145e-06,
+      "clip_ratio/low_mean": 3.470697703278347e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.918584917528278e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6943.53125,
+      "completions/mean_terminated_length": 6639.0,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "entropy": 0.9009081721305847,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028925195802003145,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "num_tokens": 88985269.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3535328209400177,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980628490448,
+      "sampling/importance_sampling_ratio/min": 6.553035092338177e-08,
+      "sampling/sampling_logp_difference/max": 16.540752410888672,
+      "sampling/sampling_logp_difference/mean": 0.019378282129764557,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 1.0939961612166371e-05,
+      "clip_ratio/high_mean": 2.734990403041593e-06,
+      "clip_ratio/low_mean": 2.4615862798782473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7350853201824066e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15148.0,
+      "completions/max_terminated_length": 15148.0,
+      "completions/mean_length": 4976.25,
+      "completions/mean_terminated_length": 4976.25,
+      "completions/min_length": 702.0,
+      "completions/min_terminated_length": 702.0,
+      "entropy": 0.9463540017604828,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017386430408805609,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 89645205.0,
+      "reward": 0.359375,
+      "reward_std": 0.26462042331695557,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999554753303528,
+      "sampling/importance_sampling_ratio/min": 7.889595508459024e-06,
+      "sampling/sampling_logp_difference/max": 11.74996566772461,
+      "sampling/sampling_logp_difference/mean": 0.018035830929875374,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 5.941629297012696e-06,
+      "clip_ratio/high_mean": 1.485407324253174e-06,
+      "clip_ratio/low_mean": 2.6826061798601586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8311469009167922e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 6439.5390625,
+      "completions/mean_terminated_length": 6281.69091796875,
+      "completions/min_length": 959.0,
+      "completions/min_terminated_length": 959.0,
+      "entropy": 0.899876207113266,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037381781730800867,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 90489394.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2624938488006592,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999206066131592,
+      "sampling/importance_sampling_ratio/min": 0.003606764366850257,
+      "sampling/sampling_logp_difference/max": 5.62494421005249,
+      "sampling/sampling_logp_difference/mean": 0.019368179142475128,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 5.189952389628161e-06,
+      "clip_ratio/high_mean": 1.2974880974070402e-06,
+      "clip_ratio/low_mean": 3.058137212974543e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.187886022715247e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15979.0,
+      "completions/mean_length": 6876.46875,
+      "completions/mean_terminated_length": 6408.884765625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.1018569767475128,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018562980694696307,
+      "learning_rate": 1e-05,
+      "loss": 0.095,
+      "num_tokens": 91390054.0,
+      "reward": 0.21875,
+      "reward_std": 0.29955869913101196,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999849796295166,
+      "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05,
+      "sampling/sampling_logp_difference/max": 10.436432838439941,
+      "sampling/sampling_logp_difference/mean": 0.020825792104005814,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.022083435804234e-05,
+      "clip_ratio/high_mean": 5.055208589510585e-06,
+      "clip_ratio/low_mean": 3.029032552603894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.53455343429232e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14153.0,
+      "completions/mean_length": 6501.5078125,
+      "completions/mean_terminated_length": 6344.64306640625,
+      "completions/min_length": 720.0,
+      "completions/min_terminated_length": 720.0,
+      "entropy": 1.073579266667366,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016695430967956781,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 92241535.0,
+      "reward": 0.2734375,
+      "reward_std": 0.28641316294670105,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998984336853027,
+      "sampling/importance_sampling_ratio/min": 0.0002380236255703494,
+      "sampling/sampling_logp_difference/max": 8.343140602111816,
+      "sampling/sampling_logp_difference/mean": 0.020438479259610176,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 3.3911180707946187e-06,
+      "clip_ratio/high_mean": 8.477795176986547e-07,
+      "clip_ratio/low_mean": 2.2190370486896427e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.30381500614385e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14345.0,
+      "completions/max_terminated_length": 14345.0,
+      "completions/mean_length": 5474.1328125,
+      "completions/mean_terminated_length": 5474.1328125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0692576617002487,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034909825772047043,
+      "learning_rate": 1e-05,
+      "loss": 0.0,
+      "num_tokens": 92962472.0,
+      "reward": 0.3046875,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000006079673767,
+      "sampling/importance_sampling_ratio/min": 0.0017851731972768903,
+      "sampling/sampling_logp_difference/max": 6.328239917755127,
+      "sampling/sampling_logp_difference/mean": 0.019930578768253326,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 2.6292200345778838e-05,
+      "clip_ratio/high_mean": 7.620442374900449e-06,
+      "clip_ratio/low_mean": 4.615546390596137e-05,
+      "clip_ratio/low_min": 1.366510537081922e-05,
+      "clip_ratio/region_mean": 5.3775906508235494e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7512.078125,
+      "completions/mean_terminated_length": 7225.88671875,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9676955863833427,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023449272848665714,
+      "learning_rate": 1e-05,
+      "loss": 0.0454,
+      "num_tokens": 93950506.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999359250068665,
+      "sampling/importance_sampling_ratio/min": 0.0016406332142651081,
+      "sampling/sampling_logp_difference/max": 6.412672996520996,
+      "sampling/sampling_logp_difference/mean": 0.020141655579209328,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 5.097255780128762e-06,
+      "clip_ratio/high_mean": 1.2743139450321905e-06,
+      "clip_ratio/low_mean": 3.3802551342887455e-05,
+      "clip_ratio/low_min": 4.146762421441963e-06,
+      "clip_ratio/region_mean": 3.5076865287919645e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6920.484375,
+      "completions/mean_terminated_length": 6693.3603515625,
+      "completions/min_length": 962.0,
+      "completions/min_terminated_length": 962.0,
+      "entropy": 0.8662540689110756,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037103090435266495,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 94854016.0,
+      "reward": 0.4375,
+      "reward_std": 0.322716623544693,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00047686786274425685,
+      "sampling/sampling_logp_difference/max": 7.648271083831787,
+      "sampling/sampling_logp_difference/mean": 0.01915796287357807,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 8.4922439782531e-06,
+      "clip_ratio/high_mean": 2.123060994563275e-06,
+      "clip_ratio/low_mean": 5.024227584726759e-05,
+      "clip_ratio/low_min": 1.3627016414829995e-05,
+      "clip_ratio/region_mean": 5.236533706920454e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 7939.609375,
+      "completions/mean_terminated_length": 7805.57177734375,
+      "completions/min_length": 1260.0,
+      "completions/min_terminated_length": 1260.0,
+      "entropy": 0.9707008600234985,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024642283096909523,
+      "learning_rate": 1e-05,
+      "loss": 0.0788,
+      "num_tokens": 95889966.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998771548271179,
+      "sampling/importance_sampling_ratio/min": 4.540014560916461e-05,
+      "sampling/sampling_logp_difference/max": 9.999995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020453302189707756,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.766829564710861e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.766829564710861e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14969.0,
+      "completions/mean_length": 5985.8203125,
+      "completions/mean_terminated_length": 5474.43408203125,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 0.9083090648055077,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003317479742690921,
+      "learning_rate": 1e-05,
+      "loss": 0.0537,
+      "num_tokens": 96676847.0,
+      "reward": 0.3671875,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.000286750087980181,
+      "sampling/sampling_logp_difference/max": 8.156899452209473,
+      "sampling/sampling_logp_difference/mean": 0.01996719278395176,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.8439853647578275e-05,
+      "clip_ratio/high_mean": 4.609963411894569e-06,
+      "clip_ratio/low_mean": 5.708034223061986e-05,
+      "clip_ratio/low_min": 2.75287948170444e-06,
+      "clip_ratio/region_mean": 6.169030598357494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15081.0,
+      "completions/mean_length": 6565.359375,
+      "completions/mean_terminated_length": 6488.04736328125,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 1.1013468354940414,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019073591101914644,
+      "learning_rate": 1e-05,
+      "loss": 0.0622,
+      "num_tokens": 97539453.0,
+      "reward": 0.2734375,
+      "reward_std": 0.307217001914978,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999555945396423,
+      "sampling/importance_sampling_ratio/min": 0.0006022047018632293,
+      "sampling/sampling_logp_difference/max": 7.414913177490234,
+      "sampling/sampling_logp_difference/mean": 0.02150837704539299,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 9.068485269381199e-06,
+      "clip_ratio/high_mean": 2.2671213173452998e-06,
+      "clip_ratio/low_mean": 1.9822365402433206e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.208948649240483e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16099.0,
+      "completions/mean_length": 6779.6171875,
+      "completions/mean_terminated_length": 6703.9921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8940552547574043,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0010163087863475084,
+      "learning_rate": 1e-05,
+      "loss": 0.0249,
+      "num_tokens": 98429036.0,
+      "reward": 0.453125,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485015869141,
+      "sampling/importance_sampling_ratio/min": 3.464699460664633e-08,
+      "sampling/sampling_logp_difference/max": 17.178054809570312,
+      "sampling/sampling_logp_difference/mean": 0.018716152757406235,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 5.047242211730918e-06,
+      "clip_ratio/high_mean": 1.2618105529327295e-06,
+      "clip_ratio/low_mean": 2.9014110396019532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0275920835265424e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14549.0,
+      "completions/max_terminated_length": 14549.0,
+      "completions/mean_length": 5766.71875,
+      "completions/mean_terminated_length": 5766.71875,
+      "completions/min_length": 47.0,
+      "completions/min_terminated_length": 47.0,
+      "entropy": 1.0455922111868858,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002155766822397709,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 99184264.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999253749847412,
+      "sampling/importance_sampling_ratio/min": 0.00010798005678225309,
+      "sampling/sampling_logp_difference/max": 9.133563995361328,
+      "sampling/sampling_logp_difference/mean": 0.020948775112628937,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 2.0882574972347356e-05,
+      "clip_ratio/high_mean": 6.505383225885453e-06,
+      "clip_ratio/low_mean": 4.496008500609605e-05,
+      "clip_ratio/low_min": 7.757854064038838e-06,
+      "clip_ratio/region_mean": 5.1465468231981504e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14704.0,
+      "completions/mean_length": 6167.2421875,
+      "completions/mean_terminated_length": 6005.07177734375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "entropy": 0.9100174158811569,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0021464223973453045,
+      "learning_rate": 1e-05,
+      "loss": -0.0279,
+      "num_tokens": 99996831.0,
+      "reward": 0.421875,
+      "reward_std": 0.3916535973548889,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240040779114,
+      "sampling/importance_sampling_ratio/min": 0.02249590866267681,
+      "sampling/sampling_logp_difference/max": 3.794421911239624,
+      "sampling/sampling_logp_difference/mean": 0.01866895705461502,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0998018473837874e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0998018473837874e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15738.0,
+      "completions/mean_length": 6242.9453125,
+      "completions/mean_terminated_length": 6163.09423828125,
+      "completions/min_length": 1187.0,
+      "completions/min_terminated_length": 1187.0,
+      "entropy": 0.8624134212732315,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023277695290744305,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 100814112.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999959409236908,
+      "sampling/importance_sampling_ratio/min": 0.0002393616596236825,
+      "sampling/sampling_logp_difference/max": 8.33753490447998,
+      "sampling/sampling_logp_difference/mean": 0.0191188994795084,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 6.589872555196052e-06,
+      "clip_ratio/high_mean": 1.647468138799013e-06,
+      "clip_ratio/low_mean": 4.329304238126497e-05,
+      "clip_ratio/low_min": 3.5120251595799346e-06,
+      "clip_ratio/region_mean": 4.494051017900347e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14866.0,
+      "completions/mean_length": 5733.6875,
+      "completions/mean_terminated_length": 5478.080078125,
+      "completions/min_length": 789.0,
+      "completions/min_terminated_length": 789.0,
+      "entropy": 0.9628067463636398,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003547821193933487,
+      "learning_rate": 1e-05,
+      "loss": 0.0321,
+      "num_tokens": 101566264.0,
+      "reward": 0.3984375,
+      "reward_std": 0.36584997177124023,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0001282035664189607,
+      "sampling/sampling_logp_difference/max": 8.961891174316406,
+      "sampling/sampling_logp_difference/mean": 0.019646761938929558,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.7107527582993498e-05,
+      "clip_ratio/high_mean": 4.2768818957483745e-06,
+      "clip_ratio/low_mean": 3.014796902789385e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.442485103732906e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15848.0,
+      "completions/max_terminated_length": 15848.0,
+      "completions/mean_length": 5505.9375,
+      "completions/mean_terminated_length": 5505.9375,
+      "completions/min_length": 668.0,
+      "completions/min_terminated_length": 668.0,
+      "entropy": 0.8041045889258385,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024891747161746025,
+      "learning_rate": 1e-05,
+      "loss": 0.1406,
+      "num_tokens": 102291456.0,
+      "reward": 0.5,
+      "reward_std": 0.35482609272003174,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999248385429382,
+      "sampling/importance_sampling_ratio/min": 0.0014627616619691253,
+      "sampling/sampling_logp_difference/max": 6.527429103851318,
+      "sampling/sampling_logp_difference/mean": 0.01716250739991665,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.548903105685895e-05,
+      "clip_ratio/high_mean": 3.872257764214737e-06,
+      "clip_ratio/low_mean": 5.380711581892683e-05,
+      "clip_ratio/low_min": 4.5777483137499075e-06,
+      "clip_ratio/region_mean": 5.767937363998499e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16005.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 5003.0625,
+      "completions/mean_terminated_length": 5003.0625,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 0.9115714654326439,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00220683915540576,
+      "learning_rate": 1e-05,
+      "loss": 0.1361,
+      "num_tokens": 102949824.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 8.323705696966499e-05,
+      "sampling/sampling_logp_difference/max": 9.393817901611328,
+      "sampling/sampling_logp_difference/mean": 0.018076512962579727,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.181136096623959e-05,
+      "clip_ratio/high_mean": 5.4528402415598975e-06,
+      "clip_ratio/low_mean": 3.4416837252138066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986967681157694e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15658.0,
+      "completions/max_terminated_length": 15658.0,
+      "completions/mean_length": 4742.1328125,
+      "completions/mean_terminated_length": 4742.1328125,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 0.9430246204137802,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003964806906878948,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 103580913.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 7.031940185697749e-05,
+      "sampling/sampling_logp_difference/max": 9.56246280670166,
+      "sampling/sampling_logp_difference/mean": 0.019651200622320175,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 4.07684046876966e-06,
+      "clip_ratio/high_mean": 1.019210117192415e-06,
+      "clip_ratio/low_mean": 3.8682398553646635e-05,
+      "clip_ratio/low_min": 8.189203072106466e-06,
+      "clip_ratio/region_mean": 3.970160832977854e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 6574.171875,
+      "completions/mean_terminated_length": 6091.72119140625,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.8429529070854187,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002067410387098789,
+      "learning_rate": 1e-05,
+      "loss": 0.0377,
+      "num_tokens": 104447463.0,
+      "reward": 0.3125,
+      "reward_std": 0.24511480331420898,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997583627700806,
+      "sampling/importance_sampling_ratio/min": 0.00021258489869069308,
+      "sampling/sampling_logp_difference/max": 8.456169128417969,
+      "sampling/sampling_logp_difference/mean": 0.018853647634387016,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 1.9725823221961036e-05,
+      "clip_ratio/high_mean": 4.931455805490259e-06,
+      "clip_ratio/low_mean": 5.9263072444082354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.419452870431996e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15518.0,
+      "completions/max_terminated_length": 15518.0,
+      "completions/mean_length": 4581.5625,
+      "completions/mean_terminated_length": 4581.5625,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "entropy": 0.7094272822141647,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004292502999305725,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 105052287.0,
+      "reward": 0.625,
+      "reward_std": 0.3908300995826721,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.0019342642044648528,
+      "sampling/sampling_logp_difference/max": 6.24802827835083,
+      "sampling/sampling_logp_difference/mean": 0.016310662031173706,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.0132298029930098e-05,
+      "clip_ratio/high_mean": 2.5330745074825245e-06,
+      "clip_ratio/low_mean": 4.6397121650443296e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.893019581686531e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16097.0,
+      "completions/mean_length": 7066.4453125,
+      "completions/mean_terminated_length": 6918.5478515625,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8481669947504997,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015785128343850374,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 105977048.0,
+      "reward": 0.3515625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 0.00104097044095397,
+      "sampling/sampling_logp_difference/max": 6.8676018714904785,
+      "sampling/sampling_logp_difference/mean": 0.018304405733942986,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 1.6989023606583942e-05,
+      "clip_ratio/high_mean": 4.2472559016459854e-06,
+      "clip_ratio/low_mean": 2.3075059743860038e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7322315418132348e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16104.0,
+      "completions/max_terminated_length": 16104.0,
+      "completions/mean_length": 6230.5234375,
+      "completions/mean_terminated_length": 6230.5234375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "entropy": 0.9658062160015106,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002542720176279545,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 106793187.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3050953149795532,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000169277191162,
+      "sampling/importance_sampling_ratio/min": 0.0002781494113150984,
+      "sampling/sampling_logp_difference/max": 8.187352180480957,
+      "sampling/sampling_logp_difference/mean": 0.019391046836972237,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7597974508353218e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7597974508353218e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14216.0,
+      "completions/mean_length": 5690.5546875,
+      "completions/mean_terminated_length": 5606.3544921875,
+      "completions/min_length": 1124.0,
+      "completions/min_terminated_length": 1124.0,
+      "entropy": 1.0098655670881271,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001451602904126048,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 107539874.0,
+      "reward": 0.4296875,
+      "reward_std": 0.23304283618927002,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999307990074158,
+      "sampling/importance_sampling_ratio/min": 5.640022671116185e-09,
+      "sampling/sampling_logp_difference/max": 18.993377685546875,
+      "sampling/sampling_logp_difference/mean": 0.018607191741466522,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 1.2800467629858758e-05,
+      "clip_ratio/high_mean": 4.19954119479371e-06,
+      "clip_ratio/low_mean": 2.350350996493944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.770305115973315e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15791.0,
+      "completions/max_terminated_length": 15791.0,
+      "completions/mean_length": 5471.1328125,
+      "completions/mean_terminated_length": 5471.1328125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0413162112236023,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023549250327050686,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 108260091.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999832510948181,
+      "sampling/importance_sampling_ratio/min": 0.0011709182290360332,
+      "sampling/sampling_logp_difference/max": 6.749967098236084,
+      "sampling/sampling_logp_difference/mean": 0.020427243784070015,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.1983064925734652e-05,
+      "clip_ratio/high_mean": 5.495766231433663e-06,
+      "clip_ratio/low_mean": 4.361141452591255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9107180757346214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 6211.7421875,
+      "completions/mean_terminated_length": 6050.2783203125,
+      "completions/min_length": 622.0,
+      "completions/min_terminated_length": 622.0,
+      "entropy": 0.9706784337759018,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017527056625112891,
+      "learning_rate": 1e-05,
+      "loss": 0.0686,
+      "num_tokens": 109073890.0,
+      "reward": 0.421875,
+      "reward_std": 0.29826050996780396,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999092221260071,
+      "sampling/importance_sampling_ratio/min": 0.002898645820096135,
+      "sampling/sampling_logp_difference/max": 5.843511581420898,
+      "sampling/sampling_logp_difference/mean": 0.018898162990808487,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.208964992358233e-05,
+      "clip_ratio/low_min": 3.9168990042526275e-06,
+      "clip_ratio/region_mean": 4.208964992358233e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14880.0,
+      "completions/mean_length": 6007.8984375,
+      "completions/mean_terminated_length": 5926.19677734375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.1967609524726868,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0007858420140109956,
+      "learning_rate": 1e-05,
+      "loss": 0.011,
+      "num_tokens": 109861813.0,
+      "reward": 0.296875,
+      "reward_std": 0.23486506938934326,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 3.294382011631569e-08,
+      "sampling/sampling_logp_difference/max": 17.22846221923828,
+      "sampling/sampling_logp_difference/mean": 0.021845955401659012,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 4.5118208618077915e-06,
+      "clip_ratio/high_mean": 1.1279552154519479e-06,
+      "clip_ratio/low_mean": 3.749712686840212e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8625082197540905e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15838.0,
+      "completions/mean_length": 6800.9921875,
+      "completions/mean_terminated_length": 6725.53564453125,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "entropy": 1.0437887012958527,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029428249690681696,
+      "learning_rate": 1e-05,
+      "loss": 0.0405,
+      "num_tokens": 110756572.0,
+      "reward": 0.265625,
+      "reward_std": 0.3248382806777954,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999890327453613,
+      "sampling/importance_sampling_ratio/min": 0.0006329434108920395,
+      "sampling/sampling_logp_difference/max": 7.365129470825195,
+      "sampling/sampling_logp_difference/mean": 0.02010120078921318,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.427700522071973e-05,
+      "clip_ratio/high_mean": 3.5692513051799324e-06,
+      "clip_ratio/low_mean": 4.964020990883e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.320946092979284e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6309.4453125,
+      "completions/mean_terminated_length": 6230.1181640625,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "entropy": 0.9768906533718109,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002088683657348156,
+      "learning_rate": 1e-05,
+      "loss": 0.0316,
+      "num_tokens": 111585493.0,
+      "reward": 0.375,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000007152557373,
+      "sampling/importance_sampling_ratio/min": 0.009723234921693802,
+      "sampling/sampling_logp_difference/max": 4.633236885070801,
+      "sampling/sampling_logp_difference/mean": 0.020927833393216133,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 5.4841398196003865e-06,
+      "clip_ratio/high_mean": 1.3710349549000966e-06,
+      "clip_ratio/low_mean": 5.122006064084417e-05,
+      "clip_ratio/low_min": 3.785125954891555e-06,
+      "clip_ratio/region_mean": 5.25910957094311e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15209.0,
+      "completions/mean_length": 6221.859375,
+      "completions/mean_terminated_length": 6060.5556640625,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "entropy": 0.9212924689054489,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002406956860795617,
+      "learning_rate": 1e-05,
+      "loss": 0.1051,
+      "num_tokens": 112400363.0,
+      "reward": 0.40625,
+      "reward_std": 0.31929677724838257,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 5.8308287407271564e-05,
+      "sampling/sampling_logp_difference/max": 9.74976634979248,
+      "sampling/sampling_logp_difference/mean": 0.018652018159627914,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 1.4568151755156578e-05,
+      "clip_ratio/high_mean": 3.6420379387891444e-06,
+      "clip_ratio/low_mean": 3.999794398623635e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3639981413434725e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14997.0,
+      "completions/mean_length": 6942.8203125,
+      "completions/mean_terminated_length": 6716.232421875,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "entropy": 0.949538916349411,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022962254006415606,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 113308748.0,
+      "reward": 0.375,
+      "reward_std": 0.3329663872718811,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999334812164307,
+      "sampling/importance_sampling_ratio/min": 0.00048810525913722813,
+      "sampling/sampling_logp_difference/max": 7.624979496002197,
+      "sampling/sampling_logp_difference/mean": 0.01939917355775833,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 8.786732450971613e-06,
+      "clip_ratio/high_mean": 2.196683112742903e-06,
+      "clip_ratio/low_mean": 5.562954720517155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.7826231113722315e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15182.0,
+      "completions/mean_length": 6783.1796875,
+      "completions/mean_terminated_length": 6552.76025390625,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "entropy": 0.9774708449840546,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020560629200190306,
+      "learning_rate": 1e-05,
+      "loss": 0.0473,
+      "num_tokens": 114196235.0,
+      "reward": 0.34375,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998990297317505,
+      "sampling/importance_sampling_ratio/min": 2.4757892447269114e-07,
+      "sampling/sampling_logp_difference/max": 15.211536407470703,
+      "sampling/sampling_logp_difference/mean": 0.019691556692123413,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.799483243303257e-05,
+      "clip_ratio/high_mean": 4.498708108258143e-06,
+      "clip_ratio/low_mean": 2.6389980291696702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0888688343111426e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15549.0,
+      "completions/mean_length": 5568.15625,
+      "completions/mean_terminated_length": 5396.4765625,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "entropy": 0.9303529411554337,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022214846685528755,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 114928047.0,
+      "reward": 0.234375,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999408721923828,
+      "sampling/importance_sampling_ratio/min": 2.1446083337650634e-05,
+      "sampling/sampling_logp_difference/max": 10.749968528747559,
+      "sampling/sampling_logp_difference/mean": 0.01938418298959732,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 1.1957493370573502e-05,
+      "clip_ratio/high_mean": 2.9893733426433755e-06,
+      "clip_ratio/low_mean": 5.885063319510664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.184000585562899e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15340.0,
+      "completions/max_terminated_length": 15340.0,
+      "completions/mean_length": 6086.578125,
+      "completions/mean_terminated_length": 6086.578125,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 0.9131873697042465,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002448044717311859,
+      "learning_rate": 1e-05,
+      "loss": 0.0599,
+      "num_tokens": 115725657.0,
+      "reward": 0.40625,
+      "reward_std": 0.35878273844718933,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999779462814331,
+      "sampling/importance_sampling_ratio/min": 0.02929726243019104,
+      "sampling/sampling_logp_difference/max": 3.530261278152466,
+      "sampling/sampling_logp_difference/mean": 0.019298439845442772,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 1.3385357760853367e-05,
+      "clip_ratio/high_mean": 3.3463394402133417e-06,
+      "clip_ratio/low_mean": 5.717015119444113e-05,
+      "clip_ratio/low_min": 3.4328400033700746e-06,
+      "clip_ratio/region_mean": 6.0516490520967636e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 6442.5390625,
+      "completions/mean_terminated_length": 6203.9443359375,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.8959419652819633,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002013204852119088,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 116571478.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000044584274292,
+      "sampling/importance_sampling_ratio/min": 1.0374163821325055e-06,
+      "sampling/sampling_logp_difference/max": 13.778777122497559,
+      "sampling/sampling_logp_difference/mean": 0.01925014518201351,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 9.34224021875707e-06,
+      "clip_ratio/high_mean": 3.136903728773177e-06,
+      "clip_ratio/low_mean": 2.9738095065567904e-05,
+      "clip_ratio/low_min": 3.7240065466903616e-06,
+      "clip_ratio/region_mean": 3.2874999135401595e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15946.0,
+      "completions/mean_length": 6633.5703125,
+      "completions/mean_terminated_length": 6319.0400390625,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.0223619118332863,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024523327592760324,
+      "learning_rate": 1e-05,
+      "loss": 0.056,
+      "num_tokens": 117440743.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 3.0026931199245155e-05,
+      "sampling/sampling_logp_difference/max": 10.413415908813477,
+      "sampling/sampling_logp_difference/mean": 0.02061290666460991,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 1.4537483366439119e-05,
+      "clip_ratio/high_mean": 3.6343708416097797e-06,
+      "clip_ratio/low_mean": 3.954866042477079e-05,
+      "clip_ratio/low_min": 9.874949228105834e-06,
+      "clip_ratio/region_mean": 4.318303126638057e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15919.0,
+      "completions/mean_length": 7183.0,
+      "completions/mean_terminated_length": 6886.193359375,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "entropy": 0.9815369099378586,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0018688985146582127,
+      "learning_rate": 1e-05,
+      "loss": 0.0395,
+      "num_tokens": 118380687.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2498900145292282,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999039173126221,
+      "sampling/importance_sampling_ratio/min": 1.3847662557964213e-05,
+      "sampling/sampling_logp_difference/max": 11.187394142150879,
+      "sampling/sampling_logp_difference/mean": 0.019792160019278526,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 7.165636361605721e-06,
+      "clip_ratio/high_mean": 1.7914090904014301e-06,
+      "clip_ratio/low_mean": 4.9011068711024564e-05,
+      "clip_ratio/low_min": 1.0991705721608014e-05,
+      "clip_ratio/region_mean": 5.0802477687739156e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16246.0,
+      "completions/mean_length": 6324.640625,
+      "completions/mean_terminated_length": 5829.91748046875,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "entropy": 0.852975606918335,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002005894435569644,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 119207089.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000035762786865,
+      "sampling/importance_sampling_ratio/min": 5.788659223071591e-07,
+      "sampling/sampling_logp_difference/max": 14.362195014953613,
+      "sampling/sampling_logp_difference/mean": 0.01853565312922001,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 7.795394822096569e-06,
+      "clip_ratio/high_mean": 1.948848705524142e-06,
+      "clip_ratio/low_mean": 3.834237736555224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0291225786859286e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 5723.421875,
+      "completions/mean_terminated_length": 5290.06494140625,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.8744911625981331,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002577397273853421,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 119961895.0,
+      "reward": 0.390625,
+      "reward_std": 0.34321609139442444,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999703764915466,
+      "sampling/importance_sampling_ratio/min": 0.07882421463727951,
+      "sampling/sampling_logp_difference/max": 2.5405349731445312,
+      "sampling/sampling_logp_difference/mean": 0.018341556191444397,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 9.214097190124448e-06,
+      "clip_ratio/high_mean": 2.303524297531112e-06,
+      "clip_ratio/low_mean": 2.636873176697918e-05,
+      "clip_ratio/low_min": 2.9339967113628518e-06,
+      "clip_ratio/region_mean": 2.8672255837136618e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16055.0,
+      "completions/mean_length": 7886.015625,
+      "completions/mean_terminated_length": 7682.064453125,
+      "completions/min_length": 989.0,
+      "completions/min_terminated_length": 989.0,
+      "entropy": 0.9391767829656601,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002552987542003393,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 120990289.0,
+      "reward": 0.328125,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000030994415283,
+      "sampling/importance_sampling_ratio/min": 0.000899312668479979,
+      "sampling/sampling_logp_difference/max": 7.013879776000977,
+      "sampling/sampling_logp_difference/mean": 0.02049873024225235,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 3.406416203688423e-05,
+      "clip_ratio/high_mean": 9.72330332160709e-06,
+      "clip_ratio/low_mean": 3.168332909808669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140663151019908e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 6173.1640625,
+      "completions/mean_terminated_length": 6011.087890625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.9148785546422005,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002678362652659416,
+      "learning_rate": 1e-05,
+      "loss": 0.039,
+      "num_tokens": 121797958.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3608373999595642,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999265074729919,
+      "sampling/importance_sampling_ratio/min": 0.002013920107856393,
+      "sampling/sampling_logp_difference/max": 6.207672119140625,
+      "sampling/sampling_logp_difference/mean": 0.018977735191583633,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 1.8476588593330234e-05,
+      "clip_ratio/high_mean": 4.6191471483325586e-06,
+      "clip_ratio/low_mean": 4.459614581264759e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9215293188353826e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 6594.21875,
+      "completions/mean_terminated_length": 6196.259765625,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "entropy": 0.9486038386821747,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033711253199726343,
+      "learning_rate": 1e-05,
+      "loss": 0.026,
+      "num_tokens": 122661170.0,
+      "reward": 0.3828125,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998981356620789,
+      "sampling/importance_sampling_ratio/min": 0.0002968419576063752,
+      "sampling/sampling_logp_difference/max": 8.122310638427734,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 7.97335997049231e-06,
+      "clip_ratio/high_mean": 2.7343705824023345e-06,
+      "clip_ratio/low_mean": 5.420079878604156e-05,
+      "clip_ratio/low_min": 4.594068286678521e-06,
+      "clip_ratio/region_mean": 5.693517005056492e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15928.0,
+      "completions/mean_length": 6533.9453125,
+      "completions/mean_terminated_length": 6377.595703125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9986584335565567,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017857529455795884,
+      "learning_rate": 1e-05,
+      "loss": 0.0804,
+      "num_tokens": 123518107.0,
+      "reward": 0.34375,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998549818992615,
+      "sampling/importance_sampling_ratio/min": 9.012701411847956e-06,
+      "sampling/sampling_logp_difference/max": 11.616875648498535,
+      "sampling/sampling_logp_difference/mean": 0.02010391652584076,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 4.470512521947967e-06,
+      "clip_ratio/high_mean": 1.1176281304869917e-06,
+      "clip_ratio/low_mean": 3.5141094485879876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.625872295742738e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13212.0,
+      "completions/mean_length": 5742.21875,
+      "completions/mean_terminated_length": 5658.42529296875,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0379670709371567,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018227624241262674,
+      "learning_rate": 1e-05,
+      "loss": -0.0237,
+      "num_tokens": 124279031.0,
+      "reward": 0.21875,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998506903648376,
+      "sampling/importance_sampling_ratio/min": 0.0020977305248379707,
+      "sampling/sampling_logp_difference/max": 6.16689920425415,
+      "sampling/sampling_logp_difference/mean": 0.019987668842077255,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.0003542683989508e-05,
+      "clip_ratio/high_mean": 3.21091931709816e-06,
+      "clip_ratio/low_mean": 5.731009014198207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.0521009800140746e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7584.703125,
+      "completions/mean_terminated_length": 7515.41748046875,
+      "completions/min_length": 884.0,
+      "completions/min_terminated_length": 884.0,
+      "entropy": 0.953459307551384,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002219022251665592,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 125270761.0,
+      "reward": 0.359375,
+      "reward_std": 0.37033066153526306,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999880790710449,
+      "sampling/importance_sampling_ratio/min": 0.0024849213659763336,
+      "sampling/sampling_logp_difference/max": 5.997514247894287,
+      "sampling/sampling_logp_difference/mean": 0.020291510969400406,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 7.734669452474918e-06,
+      "clip_ratio/high_mean": 1.9336673631187296e-06,
+      "clip_ratio/low_mean": 3.1135301298945706e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3068968605221016e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 4714.671875,
+      "completions/mean_terminated_length": 4622.78759765625,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 1.018719919025898,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0014189074281603098,
+      "learning_rate": 1e-05,
+      "loss": 0.0501,
+      "num_tokens": 125895279.0,
+      "reward": 0.3984375,
+      "reward_std": 0.28383445739746094,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479651451111,
+      "sampling/importance_sampling_ratio/min": 4.017410901724361e-07,
+      "sampling/sampling_logp_difference/max": 14.727458000183105,
+      "sampling/sampling_logp_difference/mean": 0.018739396706223488,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 1.0069575182569679e-05,
+      "clip_ratio/high_mean": 2.5173937956424197e-06,
+      "clip_ratio/low_mean": 3.824179225375701e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0759185367278405e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15913.0,
+      "completions/mean_length": 6316.140625,
+      "completions/mean_terminated_length": 6074.51220703125,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "entropy": 0.9325072392821312,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001702460227534175,
+      "learning_rate": 1e-05,
+      "loss": 0.1007,
+      "num_tokens": 126722881.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999539852142334,
+      "sampling/importance_sampling_ratio/min": 0.0012551364488899708,
+      "sampling/sampling_logp_difference/max": 6.680510997772217,
+      "sampling/sampling_logp_difference/mean": 0.01929408684372902,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 6.873041002108948e-06,
+      "clip_ratio/high_mean": 1.718260250527237e-06,
+      "clip_ratio/low_mean": 3.119859468370123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.291685527528898e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15832.0,
+      "completions/mean_length": 4687.140625,
+      "completions/mean_terminated_length": 4595.03955078125,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 1.0886607319116592,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032931750174611807,
+      "learning_rate": 1e-05,
+      "loss": 0.0078,
+      "num_tokens": 127341715.0,
+      "reward": 0.28125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821186065674,
+      "sampling/importance_sampling_ratio/min": 0.0019364450126886368,
+      "sampling/sampling_logp_difference/max": 6.246901512145996,
+      "sampling/sampling_logp_difference/mean": 0.020621225237846375,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 1.773085250533768e-05,
+      "clip_ratio/high_mean": 4.43271312633442e-06,
+      "clip_ratio/low_mean": 4.30743207289197e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7507033741567284e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14125.0,
+      "completions/mean_length": 5705.515625,
+      "completions/mean_terminated_length": 5449.232421875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0523068830370903,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0031696646474301815,
+      "learning_rate": 1e-05,
+      "loss": -0.0414,
+      "num_tokens": 128093597.0,
+      "reward": 0.1953125,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619126319885,
+      "sampling/importance_sampling_ratio/min": 3.197810656274669e-05,
+      "sampling/sampling_logp_difference/max": 10.350459098815918,
+      "sampling/sampling_logp_difference/mean": 0.021961934864521027,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 1.885905066956184e-05,
+      "clip_ratio/high_mean": 4.71476266739046e-06,
+      "clip_ratio/low_mean": 5.0530389898995054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.524515336219338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15958.0,
+      "completions/mean_length": 6214.4921875,
+      "completions/mean_terminated_length": 6053.07177734375,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.9371421113610268,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023704832419753075,
+      "learning_rate": 1e-05,
+      "loss": 0.075,
+      "num_tokens": 128906948.0,
+      "reward": 0.40625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.0003354824730195105,
+      "sampling/sampling_logp_difference/max": 7.999940872192383,
+      "sampling/sampling_logp_difference/mean": 0.01882763020694256,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 3.042072216885572e-05,
+      "clip_ratio/high_mean": 7.60518054221393e-06,
+      "clip_ratio/low_mean": 4.5897569179942366e-05,
+      "clip_ratio/low_min": 8.727477506909054e-06,
+      "clip_ratio/region_mean": 5.3502750233747065e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 7127.0703125,
+      "completions/mean_terminated_length": 7054.18115234375,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.9854387491941452,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003370177699252963,
+      "learning_rate": 1e-05,
+      "loss": 0.1197,
+      "num_tokens": 129839813.0,
+      "reward": 0.359375,
+      "reward_std": 0.3329663574695587,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999907910823822,
+      "sampling/importance_sampling_ratio/min": 1.077816432371037e-05,
+      "sampling/sampling_logp_difference/max": 11.43798828125,
+      "sampling/sampling_logp_difference/mean": 0.019736800342798233,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.1401074718596647e-05,
+      "clip_ratio/high_mean": 6.243764005375851e-06,
+      "clip_ratio/low_mean": 3.2797592325550795e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.904135610355297e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 6566.2890625,
+      "completions/mean_terminated_length": 6330.6640625,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.7978609576821327,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026055986527353525,
+      "learning_rate": 1e-05,
+      "loss": 0.0661,
+      "num_tokens": 130698370.0,
+      "reward": 0.5,
+      "reward_std": 0.36295419931411743,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999133944511414,
+      "sampling/importance_sampling_ratio/min": 0.00031152591691352427,
+      "sampling/sampling_logp_difference/max": 8.074028015136719,
+      "sampling/sampling_logp_difference/mean": 0.01787097379565239,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0564424403346493e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0564424403346493e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15576.0,
+      "completions/max_terminated_length": 15576.0,
+      "completions/mean_length": 7186.2890625,
+      "completions/mean_terminated_length": 7186.2890625,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 1.0232757329940796,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0023866184055805206,
+      "learning_rate": 1e-05,
+      "loss": 0.0683,
+      "num_tokens": 131637439.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2059282809495926,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999207258224487,
+      "sampling/importance_sampling_ratio/min": 0.0007378471200354397,
+      "sampling/sampling_logp_difference/max": 7.211773872375488,
+      "sampling/sampling_logp_difference/mean": 0.02137116715312004,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 4.037900725961663e-05,
+      "clip_ratio/high_mean": 1.0094751814904157e-05,
+      "clip_ratio/low_mean": 5.8380828136250784e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.847557995115494e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13638.0,
+      "completions/mean_length": 5591.5703125,
+      "completions/mean_terminated_length": 5420.26220703125,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9335208311676979,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003491115989163518,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 132371816.0,
+      "reward": 0.5,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999891459941864,
+      "sampling/importance_sampling_ratio/min": 0.00012356207298580557,
+      "sampling/sampling_logp_difference/max": 8.998766899108887,
+      "sampling/sampling_logp_difference/mean": 0.018760837614536285,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 2.8378776733006816e-06,
+      "clip_ratio/high_mean": 7.094694183251704e-07,
+      "clip_ratio/low_mean": 4.4085751369493664e-05,
+      "clip_ratio/low_min": 6.7955093072669115e-06,
+      "clip_ratio/region_mean": 4.4795220674132e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16302.0,
+      "completions/mean_length": 7152.3828125,
+      "completions/mean_terminated_length": 6930.82421875,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.1329835206270218,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002830669516697526,
+      "learning_rate": 1e-05,
+      "loss": 0.0526,
+      "num_tokens": 133307297.0,
+      "reward": 0.28125,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999501705169678,
+      "sampling/importance_sampling_ratio/min": 0.00028047082014381886,
+      "sampling/sampling_logp_difference/max": 8.179040908813477,
+      "sampling/sampling_logp_difference/mean": 0.021548541262745857,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.0150829439226072e-05,
+      "clip_ratio/high_mean": 2.537707359806518e-06,
+      "clip_ratio/low_mean": 3.4009618616437365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.654732597624388e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15068.0,
+      "completions/mean_length": 7263.453125,
+      "completions/mean_terminated_length": 7118.68310546875,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.092760555446148,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0027821618132293224,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 134260107.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999946117401123,
+      "sampling/importance_sampling_ratio/min": 7.832317351130769e-05,
+      "sampling/sampling_logp_difference/max": 9.454667091369629,
+      "sampling/sampling_logp_difference/mean": 0.022098438814282417,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 1.0561876024439698e-05,
+      "clip_ratio/high_mean": 2.6404690061099245e-06,
+      "clip_ratio/low_mean": 1.6864279416495265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9504748649978865e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15388.0,
+      "completions/mean_length": 7088.8125,
+      "completions/mean_terminated_length": 6710.958984375,
+      "completions/min_length": 1314.0,
+      "completions/min_terminated_length": 1314.0,
+      "entropy": 1.0669445469975471,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0007076738984324038,
+      "learning_rate": 1e-05,
+      "loss": -0.0197,
+      "num_tokens": 135186139.0,
+      "reward": 0.328125,
+      "reward_std": 0.20593319833278656,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998199343681335,
+      "sampling/importance_sampling_ratio/min": 3.084653872065246e-05,
+      "sampling/sampling_logp_difference/max": 10.386486053466797,
+      "sampling/sampling_logp_difference/mean": 0.020075790584087372,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 7.095016371749807e-06,
+      "clip_ratio/high_mean": 1.7737540929374518e-06,
+      "clip_ratio/low_mean": 2.7592465016823553e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.936621888238733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15626.0,
+      "completions/max_terminated_length": 15626.0,
+      "completions/mean_length": 5352.734375,
+      "completions/mean_terminated_length": 5352.734375,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "entropy": 1.0387161895632744,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0022445612121373415,
+      "learning_rate": 1e-05,
+      "loss": 0.0261,
+      "num_tokens": 135888929.0,
+      "reward": 0.4765625,
+      "reward_std": 0.399257630109787,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054670333862,
+      "sampling/importance_sampling_ratio/min": 0.00032565294532105327,
+      "sampling/sampling_logp_difference/max": 8.029678344726562,
+      "sampling/sampling_logp_difference/mean": 0.02010166086256504,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 1.5100852124305675e-05,
+      "clip_ratio/high_mean": 4.426987970873597e-06,
+      "clip_ratio/low_mean": 2.7625993425317574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2052981168817496e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16266.0,
+      "completions/mean_length": 7758.90625,
+      "completions/mean_terminated_length": 7408.29248046875,
+      "completions/min_length": 742.0,
+      "completions/min_terminated_length": 742.0,
+      "entropy": 1.0648984238505363,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022021254990249872,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 136901941.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858140945435,
+      "sampling/importance_sampling_ratio/min": 2.2461865967216e-07,
+      "sampling/sampling_logp_difference/max": 15.30886173248291,
+      "sampling/sampling_logp_difference/mean": 0.021426808089017868,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 2.5346608254039893e-05,
+      "clip_ratio/high_mean": 7.4063813144675805e-06,
+      "clip_ratio/low_mean": 2.2069365058996482e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9475746259777225e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 7036.953125,
+      "completions/mean_terminated_length": 6496.21484375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9684997871518135,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0013461806811392307,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 137824623.0,
+      "reward": 0.34375,
+      "reward_std": 0.2546031177043915,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999944806098938,
+      "sampling/importance_sampling_ratio/min": 5.834372132085264e-05,
+      "sampling/sampling_logp_difference/max": 9.74915885925293,
+      "sampling/sampling_logp_difference/mean": 0.020304443314671516,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.3147734080121154e-05,
+      "clip_ratio/high_mean": 3.2869335200302885e-06,
+      "clip_ratio/low_mean": 4.841489999307669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.170183294467279e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15500.0,
+      "completions/mean_length": 6114.1875,
+      "completions/mean_terminated_length": 5951.1748046875,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "entropy": 0.943072073161602,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132438588887453,
+      "learning_rate": 1e-05,
+      "loss": 0.0943,
+      "num_tokens": 138625247.0,
+      "reward": 0.40625,
+      "reward_std": 0.321650892496109,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999298453330994,
+      "sampling/importance_sampling_ratio/min": 0.0017275095451623201,
+      "sampling/sampling_logp_difference/max": 6.361074447631836,
+      "sampling/sampling_logp_difference/mean": 0.020084267482161522,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 1.7873157958092634e-05,
+      "clip_ratio/high_mean": 4.468289489523158e-06,
+      "clip_ratio/low_mean": 3.5252990301160025e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9721279790683184e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15050.0,
+      "completions/mean_length": 7618.875,
+      "completions/mean_terminated_length": 7034.53369140625,
+      "completions/min_length": 1030.0,
+      "completions/min_terminated_length": 1030.0,
+      "entropy": 0.9142575263977051,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026741649489849806,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 139619287.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 0.005949751473963261,
+      "sampling/sampling_logp_difference/max": 5.124405860900879,
+      "sampling/sampling_logp_difference/mean": 0.020061582326889038,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.0512151675357018e-05,
+      "clip_ratio/high_mean": 2.6280379188392544e-06,
+      "clip_ratio/low_mean": 4.5301517502593924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.792955542143318e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16106.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 5333.875,
+      "completions/mean_terminated_length": 5333.875,
+      "completions/min_length": 1109.0,
+      "completions/min_terminated_length": 1109.0,
+      "entropy": 0.8107482865452766,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027016003150492907,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 140318935.0,
+      "reward": 0.5703125,
+      "reward_std": 0.2556639611721039,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.006856904830783606,
+      "sampling/sampling_logp_difference/max": 4.982499122619629,
+      "sampling/sampling_logp_difference/mean": 0.017069874331355095,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.85085939392593e-05,
+      "clip_ratio/high_mean": 5.24943533264377e-06,
+      "clip_ratio/low_mean": 5.6120721524166584e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.137015702734061e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16050.0,
+      "completions/mean_length": 7443.3046875,
+      "completions/mean_terminated_length": 7154.89501953125,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 0.9224414080381393,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002655779244378209,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 141293534.0,
+      "reward": 0.234375,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999659061431885,
+      "sampling/importance_sampling_ratio/min": 0.00042018835665658116,
+      "sampling/sampling_logp_difference/max": 7.774807453155518,
+      "sampling/sampling_logp_difference/mean": 0.02006504125893116,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.494229445597739e-05,
+      "clip_ratio/high_mean": 3.7355736139943474e-06,
+      "clip_ratio/low_mean": 2.2748562741981004e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6484136355975352e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15923.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 5646.6875,
+      "completions/mean_terminated_length": 5646.6875,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8945339694619179,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016281780553981662,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 142037438.0,
+      "reward": 0.46875,
+      "reward_std": 0.17912296950817108,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030517578125,
+      "sampling/importance_sampling_ratio/min": 0.0005717006279155612,
+      "sampling/sampling_logp_difference/max": 7.46689510345459,
+      "sampling/sampling_logp_difference/mean": 0.019336247816681862,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 3.335990868436056e-05,
+      "clip_ratio/high_mean": 8.33997717109014e-06,
+      "clip_ratio/low_mean": 3.5050728683927446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339070608239126e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14142.0,
+      "completions/mean_length": 6384.640625,
+      "completions/mean_terminated_length": 5892.86865234375,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.840093269944191,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002166559686884284,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 142873848.0,
+      "reward": 0.4765625,
+      "reward_std": 0.35506346821784973,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000462532043457,
+      "sampling/importance_sampling_ratio/min": 4.785555574926548e-06,
+      "sampling/sampling_logp_difference/max": 12.249908447265625,
+      "sampling/sampling_logp_difference/mean": 0.018109092488884926,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.541105484648142e-05,
+      "clip_ratio/high_mean": 3.852763711620355e-06,
+      "clip_ratio/low_mean": 4.0552770769863855e-05,
+      "clip_ratio/low_min": 7.133888630050933e-06,
+      "clip_ratio/region_mean": 4.440553459517105e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14828.0,
+      "completions/mean_length": 5775.0,
+      "completions/mean_terminated_length": 5691.46435546875,
+      "completions/min_length": 1147.0,
+      "completions/min_terminated_length": 1147.0,
+      "entropy": 0.8915362879633904,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021932912059128284,
+      "learning_rate": 1e-05,
+      "loss": -0.0086,
+      "num_tokens": 143636152.0,
+      "reward": 0.4375,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000008225440979,
+      "sampling/importance_sampling_ratio/min": 9.714113069492214e-09,
+      "sampling/sampling_logp_difference/max": 18.44968605041504,
+      "sampling/sampling_logp_difference/mean": 0.019278086721897125,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7509142171311396e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7509142171311396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6181.640625,
+      "completions/mean_terminated_length": 6019.69873046875,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "entropy": 1.0544511675834656,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022947140969336033,
+      "learning_rate": 1e-05,
+      "loss": 0.0242,
+      "num_tokens": 144447370.0,
+      "reward": 0.234375,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999147653579712,
+      "sampling/importance_sampling_ratio/min": 7.419757253046555e-08,
+      "sampling/sampling_logp_difference/max": 16.416534423828125,
+      "sampling/sampling_logp_difference/mean": 0.02050788700580597,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.5700999938417226e-05,
+      "clip_ratio/high_mean": 3.9252499846043065e-06,
+      "clip_ratio/low_mean": 2.4595847037289786e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8521096965050674e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 6542.3046875,
+      "completions/mean_terminated_length": 6306.1044921875,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.933225467801094,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034910975955426693,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 145303505.0,
+      "reward": 0.390625,
+      "reward_std": 0.30433881282806396,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999945163726807,
+      "sampling/importance_sampling_ratio/min": 0.007213745731860399,
+      "sampling/sampling_logp_difference/max": 4.931766986846924,
+      "sampling/sampling_logp_difference/mean": 0.020022759214043617,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.0999414017715026e-06,
+      "clip_ratio/high_mean": 1.5249853504428756e-06,
+      "clip_ratio/low_mean": 2.61421698724007e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7667155109156738e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 5889.4765625,
+      "completions/mean_terminated_length": 5637.6083984375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 0.9649673849344254,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024078311398625374,
+      "learning_rate": 1e-05,
+      "loss": 0.0391,
+      "num_tokens": 146082198.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999341368675232,
+      "sampling/importance_sampling_ratio/min": 0.0008680344326421618,
+      "sampling/sampling_logp_difference/max": 7.04927921295166,
+      "sampling/sampling_logp_difference/mean": 0.02060198038816452,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 7.789618393871933e-06,
+      "clip_ratio/high_mean": 1.9474045984679833e-06,
+      "clip_ratio/low_mean": 3.6395756637830345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.834316100892465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16233.0,
+      "completions/mean_length": 5349.2421875,
+      "completions/mean_terminated_length": 5084.408203125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8402756005525589,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021191861014813185,
+      "learning_rate": 1e-05,
+      "loss": 0.1275,
+      "num_tokens": 146786245.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999837875366211,
+      "sampling/importance_sampling_ratio/min": 3.763807762879878e-05,
+      "sampling/sampling_logp_difference/max": 10.187494277954102,
+      "sampling/sampling_logp_difference/mean": 0.017112664878368378,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 1.2461773394534248e-05,
+      "clip_ratio/high_mean": 3.115443348633562e-06,
+      "clip_ratio/low_mean": 5.095924211673264e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4074685294835945e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 7272.3203125,
+      "completions/mean_terminated_length": 7053.64013671875,
+      "completions/min_length": 1074.0,
+      "completions/min_terminated_length": 1074.0,
+      "entropy": 0.9627499282360077,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022120666690170765,
+      "learning_rate": 1e-05,
+      "loss": 0.0079,
+      "num_tokens": 147737086.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27304792404174805,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999538660049438,
+      "sampling/importance_sampling_ratio/min": 1.6960719221970066e-05,
+      "sampling/sampling_logp_difference/max": 10.984610557556152,
+      "sampling/sampling_logp_difference/mean": 0.0203307643532753,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 1.7891727566166082e-05,
+      "clip_ratio/high_mean": 4.472931891541521e-06,
+      "clip_ratio/low_mean": 5.616715043288423e-05,
+      "clip_ratio/low_min": 7.80031223257538e-06,
+      "clip_ratio/region_mean": 6.064008221073891e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16212.0,
+      "completions/mean_length": 6387.1875,
+      "completions/mean_terminated_length": 5895.54052734375,
+      "completions/min_length": 1310.0,
+      "completions/min_terminated_length": 1310.0,
+      "entropy": 0.9110158830881119,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030851473566144705,
+      "learning_rate": 1e-05,
+      "loss": 0.1091,
+      "num_tokens": 148573782.0,
+      "reward": 0.40625,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997878074646,
+      "sampling/importance_sampling_ratio/min": 0.003961040172725916,
+      "sampling/sampling_logp_difference/max": 5.531248569488525,
+      "sampling/sampling_logp_difference/mean": 0.018049638718366623,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 1.6994396901282016e-05,
+      "clip_ratio/high_mean": 5.400205964178895e-06,
+      "clip_ratio/low_mean": 3.274822392995702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8148429439388565e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 7267.59375,
+      "completions/mean_terminated_length": 7195.81103515625,
+      "completions/min_length": 653.0,
+      "completions/min_terminated_length": 653.0,
+      "entropy": 0.9254888147115707,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020694085396826267,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 149521258.0,
+      "reward": 0.2734375,
+      "reward_std": 0.29719972610473633,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054670333862,
+      "sampling/importance_sampling_ratio/min": 7.411616934405174e-06,
+      "sampling/sampling_logp_difference/max": 11.812461853027344,
+      "sampling/sampling_logp_difference/mean": 0.01898832805454731,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 4.10414668294834e-06,
+      "clip_ratio/high_mean": 1.026036670737085e-06,
+      "clip_ratio/low_mean": 4.7441100377909606e-05,
+      "clip_ratio/low_min": 4.552241534838686e-06,
+      "clip_ratio/region_mean": 4.8467136821273016e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16076.0,
+      "completions/mean_length": 7100.1953125,
+      "completions/mean_terminated_length": 6952.83349609375,
+      "completions/min_length": 560.0,
+      "completions/min_terminated_length": 560.0,
+      "entropy": 0.8455610796809196,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003085972974076867,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 150447923.0,
+      "reward": 0.25,
+      "reward_std": 0.23645778000354767,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999178647994995,
+      "sampling/importance_sampling_ratio/min": 0.0011708807433024049,
+      "sampling/sampling_logp_difference/max": 6.749999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01974140852689743,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.6514521121280268e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6514521121280268e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15535.0,
+      "completions/mean_length": 6626.4296875,
+      "completions/mean_terminated_length": 6549.5986328125,
+      "completions/min_length": 1746.0,
+      "completions/min_terminated_length": 1746.0,
+      "entropy": 1.0323699787259102,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003505800850689411,
+      "learning_rate": 1e-05,
+      "loss": 0.0885,
+      "num_tokens": 151313834.0,
+      "reward": 0.390625,
+      "reward_std": 0.17176413536071777,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381303787231,
+      "sampling/importance_sampling_ratio/min": 2.8102756914449856e-05,
+      "sampling/sampling_logp_difference/max": 10.479642868041992,
+      "sampling/sampling_logp_difference/mean": 0.021082937717437744,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 2.006086378969485e-05,
+      "clip_ratio/high_mean": 5.890002398700744e-06,
+      "clip_ratio/low_mean": 3.503898199141986e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.092898473118112e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15595.0,
+      "completions/mean_length": 7093.109375,
+      "completions/mean_terminated_length": 6870.12841796875,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "entropy": 1.0206764563918114,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002495395252481103,
+      "learning_rate": 1e-05,
+      "loss": 0.0308,
+      "num_tokens": 152238192.0,
+      "reward": 0.2890625,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999728798866272,
+      "sampling/importance_sampling_ratio/min": 9.536534344078973e-05,
+      "sampling/sampling_logp_difference/max": 9.257795333862305,
+      "sampling/sampling_logp_difference/mean": 0.020610272884368896,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 3.2352409107261337e-06,
+      "clip_ratio/high_mean": 8.088102276815334e-07,
+      "clip_ratio/low_mean": 4.056704699451075e-05,
+      "clip_ratio/low_min": 1.1648833606159315e-05,
+      "clip_ratio/region_mean": 4.1375856994818605e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14191.0,
+      "completions/mean_length": 6795.71875,
+      "completions/mean_terminated_length": 6486.4189453125,
+      "completions/min_length": 424.0,
+      "completions/min_terminated_length": 424.0,
+      "entropy": 0.8927837759256363,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014066790463402867,
+      "learning_rate": 1e-05,
+      "loss": -0.0031,
+      "num_tokens": 153131828.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 5.093755135021638e-06,
+      "sampling/sampling_logp_difference/max": 12.187495231628418,
+      "sampling/sampling_logp_difference/mean": 0.01874586008489132,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 1.5244630048982799e-05,
+      "clip_ratio/high_mean": 3.8111575122456998e-06,
+      "clip_ratio/low_mean": 3.655197178886738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.03631290737394e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15831.0,
+      "completions/mean_length": 7075.1015625,
+      "completions/mean_terminated_length": 6617.28662109375,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 0.8989318311214447,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017937121447175741,
+      "learning_rate": 1e-05,
+      "loss": 0.0359,
+      "num_tokens": 154057097.0,
+      "reward": 0.3984375,
+      "reward_std": 0.23068872094154358,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998950958251953,
+      "sampling/importance_sampling_ratio/min": 0.00021659507183358073,
+      "sampling/sampling_logp_difference/max": 8.437480926513672,
+      "sampling/sampling_logp_difference/mean": 0.01890135183930397,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.4074375030759256e-05,
+      "clip_ratio/high_mean": 4.977033995601232e-06,
+      "clip_ratio/low_mean": 3.2670792506905855e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.764782627513341e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14100.0,
+      "completions/mean_length": 7120.0,
+      "completions/mean_terminated_length": 6743.41455078125,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "entropy": 0.8758384585380554,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003410576842725277,
+      "learning_rate": 1e-05,
+      "loss": 0.0536,
+      "num_tokens": 154988585.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999953508377075,
+      "sampling/importance_sampling_ratio/min": 0.003589102067053318,
+      "sampling/sampling_logp_difference/max": 5.629853248596191,
+      "sampling/sampling_logp_difference/mean": 0.018400676548480988,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.977112736994968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.977112736994968e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 6590.6796875,
+      "completions/mean_terminated_length": 6513.56689453125,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9243742749094963,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003304310142993927,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 155851000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999579787254333,
+      "sampling/importance_sampling_ratio/min": 1.2693599273916334e-06,
+      "sampling/sampling_logp_difference/max": 13.576997756958008,
+      "sampling/sampling_logp_difference/mean": 0.01959652081131935,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 1.1435367014200892e-05,
+      "clip_ratio/high_mean": 2.858841753550223e-06,
+      "clip_ratio/low_mean": 4.7742656533955596e-05,
+      "clip_ratio/low_min": 8.646529749967158e-06,
+      "clip_ratio/region_mean": 5.0601498060132144e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 6999.484375,
+      "completions/mean_terminated_length": 6696.7578125,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.843244343996048,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023830258287489414,
+      "learning_rate": 1e-05,
+      "loss": 0.1142,
+      "num_tokens": 156766782.0,
+      "reward": 0.359375,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998635053634644,
+      "sampling/importance_sampling_ratio/min": 0.00014761318743694574,
+      "sampling/sampling_logp_difference/max": 8.820915222167969,
+      "sampling/sampling_logp_difference/mean": 0.018434934318065643,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 2.5114631171163637e-05,
+      "clip_ratio/high_mean": 7.040741365926806e-06,
+      "clip_ratio/low_mean": 5.3607667723554187e-05,
+      "clip_ratio/low_min": 9.219345429301029e-06,
+      "clip_ratio/region_mean": 6.064840863473364e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14986.0,
+      "completions/mean_length": 6407.5,
+      "completions/mean_terminated_length": 6249.14306640625,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 0.9549195989966393,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024427250027656555,
+      "learning_rate": 1e-05,
+      "loss": 0.0795,
+      "num_tokens": 157606126.0,
+      "reward": 0.3515625,
+      "reward_std": 0.32879000902175903,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966025352478,
+      "sampling/importance_sampling_ratio/min": 0.0002305622911080718,
+      "sampling/sampling_logp_difference/max": 8.37498950958252,
+      "sampling/sampling_logp_difference/mean": 0.0192743968218565,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.928529067958152e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.928529067958152e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15519.0,
+      "completions/mean_length": 6638.390625,
+      "completions/mean_terminated_length": 5901.328125,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "entropy": 0.9070822075009346,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002024515997618437,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "num_tokens": 158474248.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999830722808838,
+      "sampling/importance_sampling_ratio/min": 0.0036068728659301996,
+      "sampling/sampling_logp_difference/max": 5.624914169311523,
+      "sampling/sampling_logp_difference/mean": 0.01955476775765419,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 8.365173471247545e-06,
+      "clip_ratio/high_mean": 2.091293367811886e-06,
+      "clip_ratio/low_mean": 4.1470637825113954e-05,
+      "clip_ratio/low_min": 4.027710474474588e-06,
+      "clip_ratio/region_mean": 4.356193130661268e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15708.0,
+      "completions/mean_length": 7324.546875,
+      "completions/mean_terminated_length": 6878.99951171875,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9108889549970627,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022787705529481173,
+      "learning_rate": 1e-05,
+      "loss": 0.0616,
+      "num_tokens": 159434350.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26515230536460876,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999351501464844,
+      "sampling/importance_sampling_ratio/min": 0.03948089852929115,
+      "sampling/sampling_logp_difference/max": 3.231938362121582,
+      "sampling/sampling_logp_difference/mean": 0.019122496247291565,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 8.65733409227687e-06,
+      "clip_ratio/high_mean": 2.1643335230692173e-06,
+      "clip_ratio/low_mean": 3.456336048657249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.672769389595487e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13983.0,
+      "completions/mean_length": 5520.4453125,
+      "completions/mean_terminated_length": 5434.9052734375,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 0.8982062339782715,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026195270475000143,
+      "learning_rate": 1e-05,
+      "loss": 0.049,
+      "num_tokens": 160163055.0,
+      "reward": 0.4375,
+      "reward_std": 0.24831004440784454,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998810291290283,
+      "sampling/importance_sampling_ratio/min": 0.0005541297141462564,
+      "sampling/sampling_logp_difference/max": 7.498111724853516,
+      "sampling/sampling_logp_difference/mean": 0.019064132124185562,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 1.8376186289970065e-05,
+      "clip_ratio/high_mean": 6.650576210631698e-06,
+      "clip_ratio/low_mean": 4.059042771586974e-05,
+      "clip_ratio/low_min": 5.350111223378917e-06,
+      "clip_ratio/region_mean": 4.724100449493562e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15267.0,
+      "completions/max_terminated_length": 15267.0,
+      "completions/mean_length": 6846.515625,
+      "completions/mean_terminated_length": 6846.515625,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9657742157578468,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0014831912703812122,
+      "learning_rate": 1e-05,
+      "loss": 0.006,
+      "num_tokens": 161057657.0,
+      "reward": 0.296875,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999252557754517,
+      "sampling/importance_sampling_ratio/min": 6.252834282349795e-05,
+      "sampling/sampling_logp_difference/max": 9.679890632629395,
+      "sampling/sampling_logp_difference/mean": 0.020372584462165833,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 1.658901419432368e-05,
+      "clip_ratio/high_mean": 4.14725354858092e-06,
+      "clip_ratio/low_mean": 4.473214539757464e-05,
+      "clip_ratio/low_min": 2.9674999950657366e-06,
+      "clip_ratio/region_mean": 4.887939894615556e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16370.0,
+      "completions/mean_length": 6946.8984375,
+      "completions/mean_terminated_length": 6642.4755859375,
+      "completions/min_length": 1133.0,
+      "completions/min_terminated_length": 1133.0,
+      "entropy": 0.8490508273243904,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017962189158424735,
+      "learning_rate": 1e-05,
+      "loss": 0.0696,
+      "num_tokens": 161966356.0,
+      "reward": 0.4296875,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 7.035569433355704e-05,
+      "sampling/sampling_logp_difference/max": 9.561946868896484,
+      "sampling/sampling_logp_difference/mean": 0.019146796315908432,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.22491199540309e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.22491199540309e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15123.0,
+      "completions/mean_length": 6618.9765625,
+      "completions/mean_terminated_length": 6463.9765625,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.9541772454977036,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017619321588426828,
+      "learning_rate": 1e-05,
+      "loss": 0.0509,
+      "num_tokens": 162836705.0,
+      "reward": 0.390625,
+      "reward_std": 0.2130674123764038,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999436140060425,
+      "sampling/importance_sampling_ratio/min": 4.2106199771296815e-07,
+      "sampling/sampling_logp_difference/max": 14.680485725402832,
+      "sampling/sampling_logp_difference/mean": 0.020236656069755554,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 1.6846054222696694e-05,
+      "clip_ratio/high_mean": 4.211513555674173e-06,
+      "clip_ratio/low_mean": 3.877300162002939e-05,
+      "clip_ratio/low_min": 4.230834292684449e-06,
+      "clip_ratio/region_mean": 4.298451551676408e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12469.0,
+      "completions/mean_length": 5485.71875,
+      "completions/mean_terminated_length": 5312.73046875,
+      "completions/min_length": 104.0,
+      "completions/min_terminated_length": 104.0,
+      "entropy": 0.8888534903526306,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002670915797352791,
+      "learning_rate": 1e-05,
+      "loss": 0.0709,
+      "num_tokens": 163558197.0,
+      "reward": 0.46875,
+      "reward_std": 0.3145885467529297,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000442266464233,
+      "sampling/importance_sampling_ratio/min": 0.0005042250850237906,
+      "sampling/sampling_logp_difference/max": 7.592487812042236,
+      "sampling/sampling_logp_difference/mean": 0.019581373780965805,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6889288480779214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6889288480779214e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16184.0,
+      "completions/mean_length": 4345.171875,
+      "completions/mean_terminated_length": 4250.3779296875,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.8308270424604416,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004005427472293377,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 164133499.0,
+      "reward": 0.578125,
+      "reward_std": 0.31642353534698486,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999247193336487,
+      "sampling/importance_sampling_ratio/min": 0.022981969639658928,
+      "sampling/sampling_logp_difference/max": 3.773045301437378,
+      "sampling/sampling_logp_difference/mean": 0.017508968710899353,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.2997116300539346e-05,
+      "clip_ratio/high_mean": 3.2492790751348366e-06,
+      "clip_ratio/low_mean": 2.723402121773688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0483300406558556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5227.296875,
+      "completions/mean_terminated_length": 5050.20654296875,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "entropy": 0.9231975972652435,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0031033784616738558,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 164823681.0,
+      "reward": 0.4765625,
+      "reward_std": 0.29249146580696106,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999896764755249,
+      "sampling/importance_sampling_ratio/min": 0.0021342060063034296,
+      "sampling/sampling_logp_difference/max": 6.149660587310791,
+      "sampling/sampling_logp_difference/mean": 0.019171088933944702,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 2.0835890609305352e-05,
+      "clip_ratio/high_mean": 5.208972652326338e-06,
+      "clip_ratio/low_mean": 2.9314877565411734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.452385044511175e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14160.0,
+      "completions/mean_length": 6473.4765625,
+      "completions/mean_terminated_length": 6316.1669921875,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 0.9061874598264694,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003495733719319105,
+      "learning_rate": 1e-05,
+      "loss": 0.0785,
+      "num_tokens": 165668798.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000354051589966,
+      "sampling/importance_sampling_ratio/min": 0.0004697878030128777,
+      "sampling/sampling_logp_difference/max": 7.663229465484619,
+      "sampling/sampling_logp_difference/mean": 0.018978482112288475,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.991967162164656e-05,
+      "clip_ratio/low_min": 6.304534053924726e-06,
+      "clip_ratio/region_mean": 3.991967162164656e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14659.0,
+      "completions/mean_length": 7140.1953125,
+      "completions/mean_terminated_length": 6605.4296875,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "entropy": 0.9605444446206093,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002381941769272089,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 166603375.0,
+      "reward": 0.3046875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 0.00043123820796608925,
+      "sampling/sampling_logp_difference/max": 7.748849868774414,
+      "sampling/sampling_logp_difference/mean": 0.021141134202480316,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.4948576790629886e-05,
+      "clip_ratio/high_mean": 3.7371441976574715e-06,
+      "clip_ratio/low_mean": 3.4953729482367635e-05,
+      "clip_ratio/low_min": 3.991060111729894e-06,
+      "clip_ratio/region_mean": 3.869087413477246e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13770.0,
+      "completions/mean_length": 5304.46875,
+      "completions/mean_terminated_length": 5038.56005859375,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.9176690131425858,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040566748939454556,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 167302275.0,
+      "reward": 0.4296875,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999827742576599,
+      "sampling/importance_sampling_ratio/min": 5.001809313398553e-07,
+      "sampling/sampling_logp_difference/max": 14.508296012878418,
+      "sampling/sampling_logp_difference/mean": 0.018822530284523964,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.653866999935417e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.653866999935417e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15791.0,
+      "completions/mean_length": 5796.5,
+      "completions/mean_terminated_length": 5542.400390625,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "entropy": 0.9230027198791504,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021502040326595306,
+      "learning_rate": 1e-05,
+      "loss": 0.0737,
+      "num_tokens": 168063627.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223351478577,
+      "sampling/importance_sampling_ratio/min": 0.009504453279078007,
+      "sampling/sampling_logp_difference/max": 4.655994892120361,
+      "sampling/sampling_logp_difference/mean": 0.01985779032111168,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 1.0863841453101486e-05,
+      "clip_ratio/high_mean": 2.7159603632753715e-06,
+      "clip_ratio/low_mean": 2.4175752741939505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6891713218901714e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14814.0,
+      "completions/mean_length": 6135.4921875,
+      "completions/mean_terminated_length": 6054.79541015625,
+      "completions/min_length": 1259.0,
+      "completions/min_terminated_length": 1259.0,
+      "entropy": 0.869445689022541,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027786416467279196,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 168867858.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999550580978394,
+      "sampling/importance_sampling_ratio/min": 2.6089865059475414e-05,
+      "sampling/sampling_logp_difference/max": 10.553963661193848,
+      "sampling/sampling_logp_difference/mean": 0.018514130264520645,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 4.36788013757905e-06,
+      "clip_ratio/high_mean": 1.0919700343947625e-06,
+      "clip_ratio/low_mean": 1.993327998661698e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0852980330564606e-06,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15691.0,
+      "completions/mean_length": 6268.2421875,
+      "completions/mean_terminated_length": 6025.46435546875,
+      "completions/min_length": 627.0,
+      "completions/min_terminated_length": 627.0,
+      "entropy": 0.951081782579422,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.0007328780484385788,
+      "learning_rate": 1e-05,
+      "loss": 0.0188,
+      "num_tokens": 169689969.0,
+      "reward": 0.3828125,
+      "reward_std": 0.10994865000247955,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000133514404297,
+      "sampling/importance_sampling_ratio/min": 1.6650999896228313e-05,
+      "sampling/sampling_logp_difference/max": 11.003040313720703,
+      "sampling/sampling_logp_difference/mean": 0.02005261555314064,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 2.131336282218399e-05,
+      "clip_ratio/high_mean": 5.3283407055459975e-06,
+      "clip_ratio/low_mean": 3.5254403428552905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.058274430462916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13861.0,
+      "completions/mean_length": 5440.8984375,
+      "completions/mean_terminated_length": 5354.732421875,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 0.8271932750940323,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034721922129392624,
+      "learning_rate": 1e-05,
+      "loss": -0.0245,
+      "num_tokens": 170409292.0,
+      "reward": 0.53125,
+      "reward_std": 0.30327308177948,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998912811279297,
+      "sampling/importance_sampling_ratio/min": 1.8372484191786498e-05,
+      "sampling/sampling_logp_difference/max": 10.904656410217285,
+      "sampling/sampling_logp_difference/mean": 0.019136395305395126,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 1.2339016848272877e-05,
+      "clip_ratio/high_mean": 4.13687178024702e-06,
+      "clip_ratio/low_mean": 2.156280152121326e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.569967330146028e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15086.0,
+      "completions/mean_length": 6671.046875,
+      "completions/mean_terminated_length": 6594.56689453125,
+      "completions/min_length": 748.0,
+      "completions/min_terminated_length": 748.0,
+      "entropy": 0.9659745842218399,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027575206477195024,
+      "learning_rate": 1e-05,
+      "loss": 0.0286,
+      "num_tokens": 171280714.0,
+      "reward": 0.375,
+      "reward_std": 0.2109457552433014,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411702156067,
+      "sampling/importance_sampling_ratio/min": 1.5700872609158978e-05,
+      "sampling/sampling_logp_difference/max": 11.06179428100586,
+      "sampling/sampling_logp_difference/mean": 0.019089506939053535,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 1.4603458112105727e-05,
+      "clip_ratio/high_mean": 3.650864528026432e-06,
+      "clip_ratio/low_mean": 3.2977761520669446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.662862599185246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15752.0,
+      "completions/mean_length": 7781.5546875,
+      "completions/mean_terminated_length": 7504.05615234375,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 1.1691131889820099,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0012711051385849714,
+      "learning_rate": 1e-05,
+      "loss": 0.0115,
+      "num_tokens": 172302489.0,
+      "reward": 0.109375,
+      "reward_std": 0.1751839816570282,
+      "rewards/accuracy_reward/mean": 0.109375,
+      "rewards/accuracy_reward/std": 0.31333550810813904,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998820424079895,
+      "sampling/importance_sampling_ratio/min": 0.005086081102490425,
+      "sampling/sampling_logp_difference/max": 5.281247615814209,
+      "sampling/sampling_logp_difference/mean": 0.023309212177991867,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 6.842087486802484e-06,
+      "clip_ratio/high_mean": 1.710521871700621e-06,
+      "clip_ratio/low_mean": 4.5269940528669395e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6980462457213434e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14891.0,
+      "completions/mean_length": 6489.96875,
+      "completions/mean_terminated_length": 6332.9208984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9354017227888107,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0016933141741901636,
+      "learning_rate": 1e-05,
+      "loss": 0.0156,
+      "num_tokens": 173149653.0,
+      "reward": 0.484375,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 0.008998609147965908,
+      "sampling/sampling_logp_difference/max": 4.7106852531433105,
+      "sampling/sampling_logp_difference/mean": 0.019165027886629105,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 2.444740721330163e-05,
+      "clip_ratio/high_mean": 6.111851803325408e-06,
+      "clip_ratio/low_mean": 3.0998270403870265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.711012095664046e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14943.0,
+      "completions/max_terminated_length": 14943.0,
+      "completions/mean_length": 6309.75,
+      "completions/mean_terminated_length": 6309.75,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "entropy": 1.012483686208725,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024940327275544405,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 173976797.0,
+      "reward": 0.4375,
+      "reward_std": 0.2790592610836029,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 0.0018720829393714666,
+      "sampling/sampling_logp_difference/max": 6.280703544616699,
+      "sampling/sampling_logp_difference/mean": 0.020797956734895706,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 1.1112337460872368e-05,
+      "clip_ratio/high_mean": 3.5388877677178243e-06,
+      "clip_ratio/low_mean": 1.7024583712554886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.056347148027271e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16362.0,
+      "completions/mean_length": 7574.984375,
+      "completions/mean_terminated_length": 7363.568359375,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9144782647490501,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002748408354818821,
+      "learning_rate": 1e-05,
+      "loss": 0.0588,
+      "num_tokens": 174965259.0,
+      "reward": 0.2734375,
+      "reward_std": 0.25224411487579346,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000108480453491,
+      "sampling/importance_sampling_ratio/min": 0.005681300535798073,
+      "sampling/sampling_logp_difference/max": 5.170575141906738,
+      "sampling/sampling_logp_difference/mean": 0.019229793921113014,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 1.4946090004741563e-05,
+      "clip_ratio/high_mean": 3.736522501185391e-06,
+      "clip_ratio/low_mean": 3.722507381098694e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.096159636901575e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6962.7734375,
+      "completions/mean_terminated_length": 6499.43408203125,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9248140156269073,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020343128126114607,
+      "learning_rate": 1e-05,
+      "loss": 0.0714,
+      "num_tokens": 175876446.0,
+      "reward": 0.421875,
+      "reward_std": 0.3156445026397705,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 0.0001609467581147328,
+      "sampling/sampling_logp_difference/max": 8.734436988830566,
+      "sampling/sampling_logp_difference/mean": 0.01860032044351101,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 4.226114015182247e-06,
+      "clip_ratio/high_mean": 1.0565285037955618e-06,
+      "clip_ratio/low_mean": 3.189400638348161e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.295053488727717e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14978.0,
+      "completions/mean_length": 6422.28125,
+      "completions/mean_terminated_length": 6264.1591796875,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 0.7786787301301956,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029119597747921944,
+      "learning_rate": 1e-05,
+      "loss": 0.1116,
+      "num_tokens": 176717226.0,
+      "reward": 0.578125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918937683105,
+      "sampling/importance_sampling_ratio/min": 0.0006287595024332404,
+      "sampling/sampling_logp_difference/max": 7.371761798858643,
+      "sampling/sampling_logp_difference/mean": 0.01786171644926071,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 5.4112551879370585e-06,
+      "clip_ratio/high_mean": 1.3528137969842646e-06,
+      "clip_ratio/low_mean": 2.103693077515345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2389744572137715e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16030.0,
+      "completions/mean_length": 6662.65625,
+      "completions/mean_terminated_length": 6508.349609375,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9501350447535515,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0027519147843122482,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 177586766.0,
+      "reward": 0.421875,
+      "reward_std": 0.21382881700992584,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000051259994507,
+      "sampling/importance_sampling_ratio/min": 2.507045428501442e-05,
+      "sampling/sampling_logp_difference/max": 10.593820571899414,
+      "sampling/sampling_logp_difference/mean": 0.020679686218500137,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 3.2487785119883483e-06,
+      "clip_ratio/high_mean": 8.121946279970871e-07,
+      "clip_ratio/low_mean": 5.783435085504607e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8646545539886574e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15419.0,
+      "completions/mean_length": 6546.171875,
+      "completions/mean_terminated_length": 6146.259765625,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "entropy": 0.9217342138290405,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017936143558472395,
+      "learning_rate": 1e-05,
+      "loss": 0.0748,
+      "num_tokens": 178444556.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000327825546265,
+      "sampling/importance_sampling_ratio/min": 8.447741129202768e-05,
+      "sampling/sampling_logp_difference/max": 9.379026412963867,
+      "sampling/sampling_logp_difference/mean": 0.019764548167586327,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 2.1980493102091714e-05,
+      "clip_ratio/high_mean": 5.4951232755229285e-06,
+      "clip_ratio/low_mean": 4.3977801396977156e-05,
+      "clip_ratio/low_min": 7.912247156127705e-06,
+      "clip_ratio/region_mean": 4.947292427459615e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15707.0,
+      "completions/max_terminated_length": 15707.0,
+      "completions/mean_length": 6433.9296875,
+      "completions/mean_terminated_length": 6433.9296875,
+      "completions/min_length": 731.0,
+      "completions/min_terminated_length": 731.0,
+      "entropy": 0.9361409991979599,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0031324021983891726,
+      "learning_rate": 1e-05,
+      "loss": 0.0505,
+      "num_tokens": 179288499.0,
+      "reward": 0.453125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999454021453857,
+      "sampling/importance_sampling_ratio/min": 0.00018488657951820642,
+      "sampling/sampling_logp_difference/max": 8.595767974853516,
+      "sampling/sampling_logp_difference/mean": 0.019691072404384613,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 1.299416817346355e-05,
+      "clip_ratio/high_mean": 3.2485420433658874e-06,
+      "clip_ratio/low_mean": 3.756406420052372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.081260635757644e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15787.0,
+      "completions/mean_length": 6037.75,
+      "completions/mean_terminated_length": 5873.52392578125,
+      "completions/min_length": 551.0,
+      "completions/min_terminated_length": 551.0,
+      "entropy": 0.8700985535979271,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0024714914616197348,
+      "learning_rate": 1e-05,
+      "loss": 0.0044,
+      "num_tokens": 180079619.0,
+      "reward": 0.484375,
+      "reward_std": 0.21436560153961182,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999628067016602,
+      "sampling/importance_sampling_ratio/min": 8.4841696661897e-05,
+      "sampling/sampling_logp_difference/max": 9.374723434448242,
+      "sampling/sampling_logp_difference/mean": 0.018519341945648193,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 7.293307589861797e-06,
+      "clip_ratio/high_mean": 1.8233268974654493e-06,
+      "clip_ratio/low_mean": 2.2305866423266707e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.412919320704532e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12264.0,
+      "completions/max_terminated_length": 12264.0,
+      "completions/mean_length": 5305.828125,
+      "completions/mean_terminated_length": 5305.828125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 1.1309608668088913,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003593914210796356,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 180780877.0,
+      "reward": 0.3984375,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011920928955,
+      "sampling/importance_sampling_ratio/min": 0.009941472671926022,
+      "sampling/sampling_logp_difference/max": 4.611040115356445,
+      "sampling/sampling_logp_difference/mean": 0.020471621304750443,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.0163415001661633e-05,
+      "clip_ratio/high_mean": 5.040853750415408e-06,
+      "clip_ratio/low_mean": 4.4980357415624894e-05,
+      "clip_ratio/low_min": 1.0012816346716136e-05,
+      "clip_ratio/region_mean": 5.0021211109196884e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13814.0,
+      "completions/mean_length": 6022.96875,
+      "completions/mean_terminated_length": 5774.30419921875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8560900762677193,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029816587921231985,
+      "learning_rate": 1e-05,
+      "loss": 0.0913,
+      "num_tokens": 181571465.0,
+      "reward": 0.515625,
+      "reward_std": 0.41504397988319397,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 1.5958334188326262e-05,
+      "sampling/sampling_logp_difference/max": 11.04552936553955,
+      "sampling/sampling_logp_difference/mean": 0.0181986466050148,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 1.8430865566188004e-05,
+      "clip_ratio/high_mean": 6.177042905619601e-06,
+      "clip_ratio/low_mean": 4.450247388376738e-05,
+      "clip_ratio/low_min": 4.840271230932558e-06,
+      "clip_ratio/region_mean": 5.067951724413433e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15130.0,
+      "completions/max_terminated_length": 15130.0,
+      "completions/mean_length": 6647.71875,
+      "completions/mean_terminated_length": 6647.71875,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "entropy": 0.9455481320619583,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0031632622703909874,
+      "learning_rate": 1e-05,
+      "loss": 0.1317,
+      "num_tokens": 182440957.0,
+      "reward": 0.3828125,
+      "reward_std": 0.39902517199516296,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000306367874146,
+      "sampling/importance_sampling_ratio/min": 1.4739508515049238e-05,
+      "sampling/sampling_logp_difference/max": 11.124979019165039,
+      "sampling/sampling_logp_difference/mean": 0.01906408555805683,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 2.2937053017813014e-05,
+      "clip_ratio/high_mean": 5.7342632544532535e-06,
+      "clip_ratio/low_mean": 6.042617155799235e-05,
+      "clip_ratio/low_min": 1.1000354334100848e-05,
+      "clip_ratio/region_mean": 6.616043401663774e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15988.0,
+      "completions/mean_length": 6809.1640625,
+      "completions/mean_terminated_length": 6500.29833984375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 1.050546184182167,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00162694591563195,
+      "learning_rate": 1e-05,
+      "loss": 0.0346,
+      "num_tokens": 183332242.0,
+      "reward": 0.421875,
+      "reward_std": 0.33616161346435547,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000290870666504,
+      "sampling/importance_sampling_ratio/min": 4.244970114086755e-06,
+      "sampling/sampling_logp_difference/max": 12.369775772094727,
+      "sampling/sampling_logp_difference/mean": 0.021866722032427788,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 1.4678411844215589e-05,
+      "clip_ratio/high_mean": 3.669602961053897e-06,
+      "clip_ratio/low_mean": 2.4373607971028832e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8043211159456405e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6815.5,
+      "completions/mean_terminated_length": 6506.83837890625,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "entropy": 1.060033954679966,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024887355975806713,
+      "learning_rate": 1e-05,
+      "loss": 0.1059,
+      "num_tokens": 184225138.0,
+      "reward": 0.328125,
+      "reward_std": 0.2869548499584198,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999393820762634,
+      "sampling/importance_sampling_ratio/min": 0.00012930770753882825,
+      "sampling/sampling_logp_difference/max": 8.953315734863281,
+      "sampling/sampling_logp_difference/mean": 0.02019432932138443,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.910891326901037e-06,
+      "clip_ratio/high_mean": 1.9777228317252593e-06,
+      "clip_ratio/low_mean": 3.8802519611635944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.078024221598753e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15838.0,
+      "completions/mean_length": 6928.4453125,
+      "completions/mean_terminated_length": 6623.42724609375,
+      "completions/min_length": 304.0,
+      "completions/min_terminated_length": 304.0,
+      "entropy": 0.9051575735211372,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002783838426694274,
+      "learning_rate": 1e-05,
+      "loss": 0.0624,
+      "num_tokens": 185136323.0,
+      "reward": 0.3359375,
+      "reward_std": 0.25460803508758545,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999524354934692,
+      "sampling/importance_sampling_ratio/min": 1.0146355634788051e-05,
+      "sampling/sampling_logp_difference/max": 11.498395919799805,
+      "sampling/sampling_logp_difference/mean": 0.01905050128698349,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 4.399394583742833e-06,
+      "clip_ratio/high_mean": 1.0998486459357082e-06,
+      "clip_ratio/low_mean": 1.733424267058581e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8434091430208355e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14703.0,
+      "completions/mean_length": 7155.1328125,
+      "completions/mean_terminated_length": 7082.46435546875,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "entropy": 1.0119014978408813,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002105508930981159,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 186071324.0,
+      "reward": 0.328125,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999904990196228,
+      "sampling/importance_sampling_ratio/min": 0.003494206117466092,
+      "sampling/sampling_logp_difference/max": 5.656649112701416,
+      "sampling/sampling_logp_difference/mean": 0.020860780030488968,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 1.0561529961705673e-05,
+      "clip_ratio/high_mean": 3.4390433256703545e-06,
+      "clip_ratio/low_mean": 2.8499469067355676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193851205196552e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16176.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 7463.2421875,
+      "completions/mean_terminated_length": 7463.2421875,
+      "completions/min_length": 698.0,
+      "completions/min_terminated_length": 698.0,
+      "entropy": 0.9983502700924873,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013582308311015368,
+      "learning_rate": 1e-05,
+      "loss": 0.048,
+      "num_tokens": 187045035.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2517249584197998,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999428987503052,
+      "sampling/importance_sampling_ratio/min": 0.000473080639494583,
+      "sampling/sampling_logp_difference/max": 7.65624475479126,
+      "sampling/sampling_logp_difference/mean": 0.021131811663508415,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 8.509013468938065e-06,
+      "clip_ratio/high_mean": 2.127253367234516e-06,
+      "clip_ratio/low_mean": 3.985050443588989e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.197775751890731e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14938.0,
+      "completions/mean_length": 6460.984375,
+      "completions/mean_terminated_length": 6382.8505859375,
+      "completions/min_length": 1747.0,
+      "completions/min_terminated_length": 1747.0,
+      "entropy": 0.7869217246770859,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002681629965081811,
+      "learning_rate": 1e-05,
+      "loss": 0.0987,
+      "num_tokens": 187889609.0,
+      "reward": 0.5234375,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999568462371826,
+      "sampling/importance_sampling_ratio/min": 0.0015037209959700704,
+      "sampling/sampling_logp_difference/max": 6.499812602996826,
+      "sampling/sampling_logp_difference/mean": 0.016937749460339546,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 1.2362176221358823e-05,
+      "clip_ratio/high_mean": 3.0905440553397057e-06,
+      "clip_ratio/low_mean": 5.0333514764133724e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.342405825103924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15893.0,
+      "completions/mean_length": 6241.78125,
+      "completions/mean_terminated_length": 6161.92138671875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.0217387825250626,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021239183843135834,
+      "learning_rate": 1e-05,
+      "loss": 0.0353,
+      "num_tokens": 188706605.0,
+      "reward": 0.2578125,
+      "reward_std": 0.3135277330875397,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999796748161316,
+      "sampling/importance_sampling_ratio/min": 0.004853047896176577,
+      "sampling/sampling_logp_difference/max": 5.328148365020752,
+      "sampling/sampling_logp_difference/mean": 0.02103862166404724,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 6.725130333506968e-06,
+      "clip_ratio/high_mean": 1.681282583376742e-06,
+      "clip_ratio/low_mean": 3.437372129155847e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.605500387493521e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15332.0,
+      "completions/mean_length": 5638.1328125,
+      "completions/mean_terminated_length": 5553.51953125,
+      "completions/min_length": 66.0,
+      "completions/min_terminated_length": 66.0,
+      "entropy": 0.7844365313649178,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023868419229984283,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 189446294.0,
+      "reward": 0.515625,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000369548797607,
+      "sampling/importance_sampling_ratio/min": 0.0008047468145377934,
+      "sampling/sampling_logp_difference/max": 7.124982833862305,
+      "sampling/sampling_logp_difference/mean": 0.017401430755853653,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 2.887730215661577e-05,
+      "clip_ratio/high_mean": 7.219325539153942e-06,
+      "clip_ratio/low_mean": 2.826443028425274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.548375502759882e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16196.0,
+      "completions/mean_length": 6374.8046875,
+      "completions/mean_terminated_length": 6215.9287109375,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9472770467400551,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027549315709620714,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 190281461.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3167053163051605,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998682737350464,
+      "sampling/importance_sampling_ratio/min": 7.100860239006579e-05,
+      "sampling/sampling_logp_difference/max": 9.552709579467773,
+      "sampling/sampling_logp_difference/mean": 0.020243138074874878,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 1.586787766427733e-05,
+      "clip_ratio/high_mean": 3.9669694160693325e-06,
+      "clip_ratio/low_mean": 2.978218674343225e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.374915604581474e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15097.0,
+      "completions/mean_length": 6654.21875,
+      "completions/mean_terminated_length": 6499.88134765625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "entropy": 1.0028243213891983,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0013344973558560014,
+      "learning_rate": 1e-05,
+      "loss": 0.0184,
+      "num_tokens": 191156249.0,
+      "reward": 0.359375,
+      "reward_std": 0.22832971811294556,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 0.0021875568199902773,
+      "sampling/sampling_logp_difference/max": 6.124969959259033,
+      "sampling/sampling_logp_difference/mean": 0.020470600575208664,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 1.681529829511419e-05,
+      "clip_ratio/high_mean": 4.9954849146160996e-06,
+      "clip_ratio/low_mean": 2.040554932136729e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5401033553862362e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16172.0,
+      "completions/mean_length": 6767.7890625,
+      "completions/mean_terminated_length": 6537.00048828125,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "entropy": 0.9059296399354935,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016136945923790336,
+      "learning_rate": 1e-05,
+      "loss": 0.0816,
+      "num_tokens": 192040526.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999668598175049,
+      "sampling/importance_sampling_ratio/min": 1.2452921509975567e-05,
+      "sampling/sampling_logp_difference/max": 11.29355525970459,
+      "sampling/sampling_logp_difference/mean": 0.020058143883943558,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9821966563758906e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9821966563758906e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16275.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 6767.4921875,
+      "completions/mean_terminated_length": 6767.4921875,
+      "completions/min_length": 998.0,
+      "completions/min_terminated_length": 998.0,
+      "entropy": 1.0446822568774223,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002869367366656661,
+      "learning_rate": 1e-05,
+      "loss": 0.0212,
+      "num_tokens": 192926469.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2517249882221222,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586343765259,
+      "sampling/importance_sampling_ratio/min": 1.9328599591972306e-05,
+      "sampling/sampling_logp_difference/max": 10.853924751281738,
+      "sampling/sampling_logp_difference/mean": 0.021512050181627274,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 3.44581130775623e-05,
+      "clip_ratio/high_mean": 1.3001711295146379e-05,
+      "clip_ratio/low_mean": 3.6407937841431703e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.940964981869911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16261.0,
+      "completions/max_terminated_length": 16261.0,
+      "completions/mean_length": 5738.484375,
+      "completions/mean_terminated_length": 5738.484375,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "entropy": 0.8617956340312958,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002177527640014887,
+      "learning_rate": 1e-05,
+      "loss": -0.0189,
+      "num_tokens": 193678859.0,
+      "reward": 0.5546875,
+      "reward_std": 0.33220988512039185,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570846557617,
+      "sampling/importance_sampling_ratio/min": 0.0008533780346624553,
+      "sampling/sampling_logp_difference/max": 7.06630802154541,
+      "sampling/sampling_logp_difference/mean": 0.018141131848096848,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 3.861003733618418e-06,
+      "clip_ratio/high_mean": 9.652509334046044e-07,
+      "clip_ratio/low_mean": 2.7767115511778684e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8732366558870126e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15595.0,
+      "completions/mean_length": 6382.90625,
+      "completions/mean_terminated_length": 5976.357421875,
+      "completions/min_length": 464.0,
+      "completions/min_terminated_length": 464.0,
+      "entropy": 0.8692388981580734,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004127771593630314,
+      "learning_rate": 1e-05,
+      "loss": 0.0572,
+      "num_tokens": 194511847.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2767002582550049,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998810291290283,
+      "sampling/importance_sampling_ratio/min": 5.4239239943854045e-06,
+      "sampling/sampling_logp_difference/max": 12.124691009521484,
+      "sampling/sampling_logp_difference/mean": 0.018376430496573448,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 9.728395525598899e-06,
+      "clip_ratio/high_mean": 2.4320988813997246e-06,
+      "clip_ratio/low_mean": 5.3631663831765763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.606376271316549e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14504.0,
+      "completions/max_terminated_length": 14504.0,
+      "completions/mean_length": 5776.15625,
+      "completions/mean_terminated_length": 5776.15625,
+      "completions/min_length": 1018.0,
+      "completions/min_terminated_length": 1018.0,
+      "entropy": 1.1195004731416702,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00263008801266551,
+      "learning_rate": 1e-05,
+      "loss": 0.0687,
+      "num_tokens": 195270051.0,
+      "reward": 0.421875,
+      "reward_std": 0.3618982434272766,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999971866607666,
+      "sampling/importance_sampling_ratio/min": 0.005209421273320913,
+      "sampling/sampling_logp_difference/max": 5.257286548614502,
+      "sampling/sampling_logp_difference/mean": 0.019923292100429535,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.2701100786216557e-05,
+      "clip_ratio/high_mean": 3.1752751965541393e-06,
+      "clip_ratio/low_mean": 4.2162768181697174e-05,
+      "clip_ratio/low_min": 3.873926743835909e-06,
+      "clip_ratio/region_mean": 4.5338043378251314e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 7411.421875,
+      "completions/mean_terminated_length": 7196.08056640625,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "entropy": 0.9801053553819656,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002642859937623143,
+      "learning_rate": 1e-05,
+      "loss": 0.07,
+      "num_tokens": 196240913.0,
+      "reward": 0.390625,
+      "reward_std": 0.27328529953956604,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999198913574219,
+      "sampling/importance_sampling_ratio/min": 0.00017500204558018595,
+      "sampling/sampling_logp_difference/max": 8.650712966918945,
+      "sampling/sampling_logp_difference/mean": 0.021511007100343704,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 1.5122936929401476e-05,
+      "clip_ratio/high_mean": 3.780734232350369e-06,
+      "clip_ratio/low_mean": 6.367217611114029e-05,
+      "clip_ratio/low_min": 4.8010447244450916e-06,
+      "clip_ratio/region_mean": 6.745291057086433e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16127.0,
+      "completions/mean_length": 7944.65625,
+      "completions/mean_terminated_length": 7742.1123046875,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "entropy": 1.0132562816143036,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002439325675368309,
+      "learning_rate": 1e-05,
+      "loss": 0.0564,
+      "num_tokens": 197278517.0,
+      "reward": 0.34375,
+      "reward_std": 0.3161812424659729,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999248385429382,
+      "sampling/importance_sampling_ratio/min": 1.0140610356756952e-05,
+      "sampling/sampling_logp_difference/max": 11.49896240234375,
+      "sampling/sampling_logp_difference/mean": 0.02124868705868721,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 2.6017536356448545e-05,
+      "clip_ratio/high_mean": 6.504384089112136e-06,
+      "clip_ratio/low_mean": 3.7791321346958284e-05,
+      "clip_ratio/low_min": 3.2110563097376144e-06,
+      "clip_ratio/region_mean": 4.429570503816649e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 7550.0,
+      "completions/mean_terminated_length": 7409.7783203125,
+      "completions/min_length": 1469.0,
+      "completions/min_terminated_length": 1469.0,
+      "entropy": 1.0384011715650558,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014879995724186301,
+      "learning_rate": 1e-05,
+      "loss": 0.0338,
+      "num_tokens": 198265589.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24040167033672333,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999468922615051,
+      "sampling/importance_sampling_ratio/min": 8.418659126618877e-05,
+      "sampling/sampling_logp_difference/max": 9.382474899291992,
+      "sampling/sampling_logp_difference/mean": 0.021503347903490067,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.3615457191917812e-05,
+      "clip_ratio/high_mean": 4.491880531531933e-06,
+      "clip_ratio/low_mean": 3.916533574965797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.365721684962409e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16221.0,
+      "completions/mean_length": 8140.9140625,
+      "completions/mean_terminated_length": 7517.48779296875,
+      "completions/min_length": 837.0,
+      "completions/min_terminated_length": 837.0,
+      "entropy": 0.8718572407960892,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002340668346732855,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 199324938.0,
+      "reward": 0.453125,
+      "reward_std": 0.35824596881866455,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999454021453857,
+      "sampling/importance_sampling_ratio/min": 0.002325017238035798,
+      "sampling/sampling_logp_difference/max": 6.064027786254883,
+      "sampling/sampling_logp_difference/mean": 0.019466478377580643,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 2.2175697040438536e-05,
+      "clip_ratio/high_mean": 5.543924260109634e-06,
+      "clip_ratio/low_mean": 4.1318608055007644e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.686253225827386e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16263.0,
+      "completions/mean_length": 6630.96875,
+      "completions/mean_terminated_length": 6396.896484375,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "entropy": 0.7798146530985832,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001989356242120266,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 200189902.0,
+      "reward": 0.5625,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474883079529,
+      "sampling/importance_sampling_ratio/min": 0.0003315774374641478,
+      "sampling/sampling_logp_difference/max": 8.011649131774902,
+      "sampling/sampling_logp_difference/mean": 0.01849902793765068,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 3.325706302348408e-06,
+      "clip_ratio/high_mean": 8.31426575587102e-07,
+      "clip_ratio/low_mean": 2.0285911205064622e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.111733795118198e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15357.0,
+      "completions/max_terminated_length": 15357.0,
+      "completions/mean_length": 6582.203125,
+      "completions/mean_terminated_length": 6582.203125,
+      "completions/min_length": 593.0,
+      "completions/min_terminated_length": 593.0,
+      "entropy": 1.0181676000356674,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002594445599243045,
+      "learning_rate": 1e-05,
+      "loss": 0.0232,
+      "num_tokens": 201052832.0,
+      "reward": 0.34375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999495148658752,
+      "sampling/importance_sampling_ratio/min": 0.0003853558446280658,
+      "sampling/sampling_logp_difference/max": 7.8613433837890625,
+      "sampling/sampling_logp_difference/mean": 0.021598614752292633,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 2.2044430352252675e-05,
+      "clip_ratio/high_mean": 5.511107588063169e-06,
+      "clip_ratio/low_mean": 3.4155824209847196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96669319115972e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14540.0,
+      "completions/max_terminated_length": 14540.0,
+      "completions/mean_length": 6145.1796875,
+      "completions/mean_terminated_length": 6145.1796875,
+      "completions/min_length": 1098.0,
+      "completions/min_terminated_length": 1098.0,
+      "entropy": 0.9084350541234016,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003104996867477894,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 201858047.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33220985531806946,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011682510376,
+      "sampling/importance_sampling_ratio/min": 0.007650630082935095,
+      "sampling/sampling_logp_difference/max": 4.87296724319458,
+      "sampling/sampling_logp_difference/mean": 0.018979094922542572,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 2.9959978519400465e-05,
+      "clip_ratio/high_mean": 7.489994629850116e-06,
+      "clip_ratio/low_mean": 3.5255963325653283e-05,
+      "clip_ratio/low_min": 2.973075879708631e-06,
+      "clip_ratio/region_mean": 4.274595892184152e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15745.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 7259.953125,
+      "completions/mean_terminated_length": 7259.953125,
+      "completions/min_length": 960.0,
+      "completions/min_terminated_length": 960.0,
+      "entropy": 0.9823614731431007,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003212577663362026,
+      "learning_rate": 1e-05,
+      "loss": 0.0133,
+      "num_tokens": 202807673.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999860405921936,
+      "sampling/importance_sampling_ratio/min": 0.000536504783667624,
+      "sampling/sampling_logp_difference/max": 7.530435085296631,
+      "sampling/sampling_logp_difference/mean": 0.021432969719171524,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 3.273996276220714e-05,
+      "clip_ratio/high_mean": 9.095591565255745e-06,
+      "clip_ratio/low_mean": 2.9539680099333054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8635271948805894e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16369.0,
+      "completions/mean_length": 7258.71875,
+      "completions/mean_terminated_length": 7113.87353515625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8823810070753098,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001418307889252901,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 203757333.0,
+      "reward": 0.40625,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884963035583,
+      "sampling/importance_sampling_ratio/min": 0.0006408974295482039,
+      "sampling/sampling_logp_difference/max": 7.3526411056518555,
+      "sampling/sampling_logp_difference/mean": 0.019296500831842422,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 1.544119368190877e-05,
+      "clip_ratio/high_mean": 3.860298420477193e-06,
+      "clip_ratio/low_mean": 3.755458698151415e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.141488631148604e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7011.40625,
+      "completions/mean_terminated_length": 6386.56689453125,
+      "completions/min_length": 685.0,
+      "completions/min_terminated_length": 685.0,
+      "entropy": 0.8057166337966919,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001652427832596004,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 204675065.0,
+      "reward": 0.46875,
+      "reward_std": 0.24146251380443573,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918937683105,
+      "sampling/importance_sampling_ratio/min": 0.015319154597818851,
+      "sampling/sampling_logp_difference/max": 4.178651332855225,
+      "sampling/sampling_logp_difference/mean": 0.018787402659654617,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 5.222041181696113e-06,
+      "clip_ratio/high_mean": 2.209917965956265e-06,
+      "clip_ratio/low_mean": 4.0701652551433654e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.291157006264257e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14796.0,
+      "completions/max_terminated_length": 14796.0,
+      "completions/mean_length": 6243.4296875,
+      "completions/mean_terminated_length": 6243.4296875,
+      "completions/min_length": 1023.0,
+      "completions/min_terminated_length": 1023.0,
+      "entropy": 0.9856048971414566,
+      "epoch": 0.23643054277828887,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001482579973526299,
+      "learning_rate": 1e-05,
+      "loss": 0.0677,
+      "num_tokens": 205494344.0,
+      "reward": 0.5390625,
+      "reward_std": 0.28930407762527466,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998942613601685,
+      "sampling/importance_sampling_ratio/min": 0.0004254466330166906,
+      "sampling/sampling_logp_difference/max": 7.762371063232422,
+      "sampling/sampling_logp_difference/mean": 0.019727632403373718,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 6.842733455414418e-05,
+      "clip_ratio/low_min": 9.297655878981459e-06,
+      "clip_ratio/region_mean": 6.842733455414418e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15485.0,
+      "completions/mean_length": 7122.2421875,
+      "completions/mean_terminated_length": 6586.4375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.8625433370471001,
+      "epoch": 0.23735050597976082,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002006452763453126,
+      "learning_rate": 1e-05,
+      "loss": 0.0312,
+      "num_tokens": 206428775.0,
+      "reward": 0.40625,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999338388442993,
+      "sampling/importance_sampling_ratio/min": 0.00010911409481195733,
+      "sampling/sampling_logp_difference/max": 9.123116493225098,
+      "sampling/sampling_logp_difference/mean": 0.01927522011101246,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 2.887607206503162e-05,
+      "clip_ratio/high_mean": 7.219018016257905e-06,
+      "clip_ratio/low_mean": 2.7790995090981596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.501001378936053e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15254.0,
+      "completions/mean_length": 7965.2734375,
+      "completions/mean_terminated_length": 7623.6826171875,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 1.0068430602550507,
+      "epoch": 0.23827046918123276,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0029176415409892797,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 207469586.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2212003916501999,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998945593833923,
+      "sampling/importance_sampling_ratio/min": 4.06005028708023e-06,
+      "sampling/sampling_logp_difference/max": 12.414315223693848,
+      "sampling/sampling_logp_difference/mean": 0.02198987640440464,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 8.710998599781306e-06,
+      "clip_ratio/high_mean": 2.1777496499453264e-06,
+      "clip_ratio/low_mean": 4.1899779091636447e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.407752874158177e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6329.4296875,
+      "completions/mean_terminated_length": 6169.83349609375,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "entropy": 0.9399363100528717,
+      "epoch": 0.23919043238270468,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0019115234026685357,
+      "learning_rate": 1e-05,
+      "loss": 0.0399,
+      "num_tokens": 208300217.0,
+      "reward": 0.4375,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000567436218262,
+      "sampling/importance_sampling_ratio/min": 2.1449603082146496e-05,
+      "sampling/sampling_logp_difference/max": 10.749804496765137,
+      "sampling/sampling_logp_difference/mean": 0.020002204924821854,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 2.536784450057894e-05,
+      "clip_ratio/high_mean": 6.341961125144735e-06,
+      "clip_ratio/low_mean": 5.959111433639919e-05,
+      "clip_ratio/low_min": 1.1521060741870315e-05,
+      "clip_ratio/region_mean": 6.593307591629127e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15635.0,
+      "completions/mean_length": 6747.90625,
+      "completions/mean_terminated_length": 6594.95263671875,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "entropy": 0.9575144425034523,
+      "epoch": 0.24011039558417663,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003766207257285714,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 209181077.0,
+      "reward": 0.4375,
+      "reward_std": 0.3164137303829193,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999313354492188,
+      "sampling/importance_sampling_ratio/min": 1.250743298442103e-05,
+      "sampling/sampling_logp_difference/max": 11.28918743133545,
+      "sampling/sampling_logp_difference/mean": 0.020067427307367325,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 2.0626074274332495e-05,
+      "clip_ratio/high_mean": 5.156518568583124e-06,
+      "clip_ratio/low_mean": 5.808068385704246e-05,
+      "clip_ratio/low_min": 1.0360539818066172e-05,
+      "clip_ratio/region_mean": 6.32372018571914e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 6426.6953125,
+      "completions/mean_terminated_length": 6348.29150390625,
+      "completions/min_length": 767.0,
+      "completions/min_terminated_length": 767.0,
+      "entropy": 0.87480478733778,
+      "epoch": 0.24103035878564857,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002375675830990076,
+      "learning_rate": 1e-05,
+      "loss": 0.0752,
+      "num_tokens": 210023702.0,
+      "reward": 0.5078125,
+      "reward_std": 0.38900789618492126,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999383687973022,
+      "sampling/importance_sampling_ratio/min": 0.00024259372730739415,
+      "sampling/sampling_logp_difference/max": 8.324122428894043,
+      "sampling/sampling_logp_difference/mean": 0.018864646553993225,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 4.462851393327583e-06,
+      "clip_ratio/high_mean": 1.1157128483318957e-06,
+      "clip_ratio/low_mean": 3.8966268334661436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.008198141036701e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16022.0,
+      "completions/mean_length": 7223.1484375,
+      "completions/mean_terminated_length": 6927.63671875,
+      "completions/min_length": 1015.0,
+      "completions/min_terminated_length": 1015.0,
+      "entropy": 1.0218688547611237,
+      "epoch": 0.24195032198712052,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016257674433290958,
+      "learning_rate": 1e-05,
+      "loss": 0.0791,
+      "num_tokens": 210969921.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2896084189414978,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999514818191528,
+      "sampling/importance_sampling_ratio/min": 9.193710138788447e-05,
+      "sampling/sampling_logp_difference/max": 9.294405937194824,
+      "sampling/sampling_logp_difference/mean": 0.02119653858244419,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 1.2653464409595472e-05,
+      "clip_ratio/high_mean": 3.163366102398868e-06,
+      "clip_ratio/low_mean": 4.864477250521304e-05,
+      "clip_ratio/low_min": 8.641252861707471e-06,
+      "clip_ratio/region_mean": 5.1808138323394815e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15180.0,
+      "completions/max_terminated_length": 15180.0,
+      "completions/mean_length": 6974.0703125,
+      "completions/mean_terminated_length": 6974.0703125,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9265539348125458,
+      "epoch": 0.24287028518859247,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023448490537703037,
+      "learning_rate": 1e-05,
+      "loss": 0.0567,
+      "num_tokens": 211884866.0,
+      "reward": 0.390625,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000474452972412,
+      "sampling/importance_sampling_ratio/min": 0.0007677432149648666,
+      "sampling/sampling_logp_difference/max": 7.172055244445801,
+      "sampling/sampling_logp_difference/mean": 0.020384611561894417,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 1.1967917316724197e-05,
+      "clip_ratio/high_mean": 2.9919793291810493e-06,
+      "clip_ratio/low_mean": 3.179497366545547e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.478695157355105e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15657.0,
+      "completions/mean_length": 7247.2734375,
+      "completions/mean_terminated_length": 7027.9921875,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.9756898358464241,
+      "epoch": 0.24379024839006438,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003212807234376669,
+      "learning_rate": 1e-05,
+      "loss": 0.0484,
+      "num_tokens": 212833933.0,
+      "reward": 0.328125,
+      "reward_std": 0.2398776412010193,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999449253082275,
+      "sampling/importance_sampling_ratio/min": 0.001600456889718771,
+      "sampling/sampling_logp_difference/max": 6.437466144561768,
+      "sampling/sampling_logp_difference/mean": 0.0199666079133749,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 1.1404694760130951e-05,
+      "clip_ratio/high_mean": 3.887520392709121e-06,
+      "clip_ratio/low_mean": 4.0242122167910566e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4129643583801226e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15575.0,
+      "completions/mean_length": 7773.9296875,
+      "completions/mean_terminated_length": 7423.9267578125,
+      "completions/min_length": 568.0,
+      "completions/min_terminated_length": 568.0,
+      "entropy": 0.9765531942248344,
+      "epoch": 0.24471021159153633,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019600428640842438,
+      "learning_rate": 1e-05,
+      "loss": 0.0357,
+      "num_tokens": 213848508.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3129909336566925,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 2.430168751743622e-05,
+      "sampling/sampling_logp_difference/max": 10.624964714050293,
+      "sampling/sampling_logp_difference/mean": 0.020565161481499672,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 6.725708999510971e-06,
+      "clip_ratio/high_mean": 1.6814272498777427e-06,
+      "clip_ratio/low_mean": 2.869901106805628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0380438261090603e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15583.0,
+      "completions/mean_length": 6722.5,
+      "completions/mean_terminated_length": 6569.14306640625,
+      "completions/min_length": 1021.0,
+      "completions/min_terminated_length": 1021.0,
+      "entropy": 0.9291529878973961,
+      "epoch": 0.24563017479300828,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014550165506079793,
+      "learning_rate": 1e-05,
+      "loss": 0.0235,
+      "num_tokens": 214731180.0,
+      "reward": 0.4921875,
+      "reward_std": 0.19332444667816162,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999178647994995,
+      "sampling/importance_sampling_ratio/min": 0.007400285452604294,
+      "sampling/sampling_logp_difference/max": 4.90623664855957,
+      "sampling/sampling_logp_difference/mean": 0.020057080313563347,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 1.8797170469042612e-05,
+      "clip_ratio/high_mean": 6.827749643889547e-06,
+      "clip_ratio/low_mean": 3.448591337473772e-05,
+      "clip_ratio/low_min": 4.687090040533803e-06,
+      "clip_ratio/region_mean": 4.1313662677566754e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15797.0,
+      "completions/max_terminated_length": 15797.0,
+      "completions/mean_length": 7001.8671875,
+      "completions/mean_terminated_length": 7001.8671875,
+      "completions/min_length": 930.0,
+      "completions/min_terminated_length": 930.0,
+      "entropy": 1.0746883526444435,
+      "epoch": 0.24655013799448022,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002483292715623975,
+      "learning_rate": 1e-05,
+      "loss": 0.048,
+      "num_tokens": 215645819.0,
+      "reward": 0.3515625,
+      "reward_std": 0.32955142855644226,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 1.0195622053288389e-05,
+      "sampling/sampling_logp_difference/max": 11.493552207946777,
+      "sampling/sampling_logp_difference/mean": 0.020808640867471695,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 8.748068921704544e-06,
+      "clip_ratio/high_mean": 2.187017230426136e-06,
+      "clip_ratio/low_mean": 8.762007928453386e-05,
+      "clip_ratio/low_min": 2.3698836685071e-05,
+      "clip_ratio/region_mean": 8.980709480965743e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14189.0,
+      "completions/mean_length": 6663.796875,
+      "completions/mean_terminated_length": 6509.50830078125,
+      "completions/min_length": 1148.0,
+      "completions/min_terminated_length": 1148.0,
+      "entropy": 1.0000900849699974,
+      "epoch": 0.24747010119595217,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0015696679474785924,
+      "learning_rate": 1e-05,
+      "loss": 0.0731,
+      "num_tokens": 216519369.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3214311897754669,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997707605361938,
+      "sampling/importance_sampling_ratio/min": 1.288027192458685e-06,
+      "sampling/sampling_logp_difference/max": 13.562398910522461,
+      "sampling/sampling_logp_difference/mean": 0.022182684391736984,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.567897690321843e-05,
+      "clip_ratio/low_min": 3.287224444648018e-06,
+      "clip_ratio/region_mean": 4.567897690321843e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16381.0,
+      "completions/mean_length": 6978.7421875,
+      "completions/mean_terminated_length": 6829.45263671875,
+      "completions/min_length": 1661.0,
+      "completions/min_terminated_length": 1661.0,
+      "entropy": 1.0845019966363907,
+      "epoch": 0.24839006439742412,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003887100610882044,
+      "learning_rate": 1e-05,
+      "loss": 0.1076,
+      "num_tokens": 217432432.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3124619722366333,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999902248382568,
+      "sampling/importance_sampling_ratio/min": 0.02168075367808342,
+      "sampling/sampling_logp_difference/max": 3.8313302993774414,
+      "sampling/sampling_logp_difference/mean": 0.02127157337963581,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 2.444328310957644e-05,
+      "clip_ratio/high_mean": 6.11082077739411e-06,
+      "clip_ratio/low_mean": 5.1527222922231886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.7638043699625996e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15638.0,
+      "completions/mean_length": 5903.5546875,
+      "completions/mean_terminated_length": 5652.0244140625,
+      "completions/min_length": 651.0,
+      "completions/min_terminated_length": 651.0,
+      "entropy": 0.8638224303722382,
+      "epoch": 0.24931002759889603,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002851828932762146,
+      "learning_rate": 1e-05,
+      "loss": 0.0771,
+      "num_tokens": 218208399.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3713914752006531,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000553131103516,
+      "sampling/importance_sampling_ratio/min": 0.000626727007329464,
+      "sampling/sampling_logp_difference/max": 7.374999523162842,
+      "sampling/sampling_logp_difference/mean": 0.01880766451358795,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 8.474872856822913e-06,
+      "clip_ratio/high_mean": 2.118718214205728e-06,
+      "clip_ratio/low_mean": 2.5821682072546537e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.794040096887329e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16060.0,
+      "completions/max_terminated_length": 16060.0,
+      "completions/mean_length": 5596.7109375,
+      "completions/mean_terminated_length": 5596.7109375,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "entropy": 1.1127397641539574,
+      "epoch": 0.250229990800368,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018005800666287541,
+      "learning_rate": 1e-05,
+      "loss": 0.0075,
+      "num_tokens": 218944418.0,
+      "reward": 0.4375,
+      "reward_std": 0.29485049843788147,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000048875808716,
+      "sampling/importance_sampling_ratio/min": 0.01548748929053545,
+      "sampling/sampling_logp_difference/max": 4.167722702026367,
+      "sampling/sampling_logp_difference/mean": 0.02004322223365307,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 1.5034628631838132e-05,
+      "clip_ratio/high_mean": 4.925485768580984e-06,
+      "clip_ratio/low_mean": 3.539464648838475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.032013237065257e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16362.0,
+      "completions/mean_length": 7120.109375,
+      "completions/mean_terminated_length": 7047.16552734375,
+      "completions/min_length": 816.0,
+      "completions/min_terminated_length": 816.0,
+      "entropy": 1.0697019025683403,
+      "epoch": 0.2511499540018399,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022711476776748896,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 219875952.0,
+      "reward": 0.2734375,
+      "reward_std": 0.23751862347126007,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000036358833313,
+      "sampling/importance_sampling_ratio/min": 9.733050683280453e-05,
+      "sampling/sampling_logp_difference/max": 9.237398147583008,
+      "sampling/sampling_logp_difference/mean": 0.02110595628619194,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 1.0558468147792155e-05,
+      "clip_ratio/high_mean": 2.6396170369480387e-06,
+      "clip_ratio/low_mean": 3.796903268948881e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.060864915800266e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 7623.953125,
+      "completions/mean_terminated_length": 7484.9052734375,
+      "completions/min_length": 372.0,
+      "completions/min_terminated_length": 372.0,
+      "entropy": 0.8836525157094002,
+      "epoch": 0.25206991720331184,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002609838731586933,
+      "learning_rate": 1e-05,
+      "loss": 0.0563,
+      "num_tokens": 220871730.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999877214431763,
+      "sampling/importance_sampling_ratio/min": 0.0015448236372321844,
+      "sampling/sampling_logp_difference/max": 6.472845554351807,
+      "sampling/sampling_logp_difference/mean": 0.019322458654642105,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 1.144785210271948e-05,
+      "clip_ratio/high_mean": 2.86196302567987e-06,
+      "clip_ratio/low_mean": 5.795533934360719e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 6.081730361984228e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15557.0,
+      "completions/mean_length": 6778.71875,
+      "completions/mean_terminated_length": 6703.08642578125,
+      "completions/min_length": 1187.0,
+      "completions/min_terminated_length": 1187.0,
+      "entropy": 0.8968989998102188,
+      "epoch": 0.2529898804047838,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.00395589042454958,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 221761214.0,
+      "reward": 0.4921875,
+      "reward_std": 0.4032142758369446,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000214576721191,
+      "sampling/importance_sampling_ratio/min": 0.0011724763317033648,
+      "sampling/sampling_logp_difference/max": 6.7486371994018555,
+      "sampling/sampling_logp_difference/mean": 0.018937086686491966,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 2.708495139813749e-05,
+      "clip_ratio/high_mean": 7.628764933542698e-06,
+      "clip_ratio/low_mean": 3.0297362627607072e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.792612744746293e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 7319.2578125,
+      "completions/mean_terminated_length": 6794.85107421875,
+      "completions/min_length": 1034.0,
+      "completions/min_terminated_length": 1034.0,
+      "entropy": 0.870811752974987,
+      "epoch": 0.25390984360625574,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002063714899122715,
+      "learning_rate": 1e-05,
+      "loss": 0.0271,
+      "num_tokens": 222719287.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2835301160812378,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999525547027588,
+      "sampling/importance_sampling_ratio/min": 2.13631665246794e-05,
+      "sampling/sampling_logp_difference/max": 10.7538423538208,
+      "sampling/sampling_logp_difference/mean": 0.019336167722940445,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 3.860288416035473e-06,
+      "clip_ratio/high_mean": 9.650721040088683e-07,
+      "clip_ratio/low_mean": 2.303871349340625e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4003785597415117e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16326.0,
+      "completions/mean_length": 6207.4140625,
+      "completions/mean_terminated_length": 5879.13671875,
+      "completions/min_length": 752.0,
+      "completions/min_terminated_length": 752.0,
+      "entropy": 0.8348869979381561,
+      "epoch": 0.2548298068077277,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023463829420506954,
+      "learning_rate": 1e-05,
+      "loss": 0.0696,
+      "num_tokens": 223533372.0,
+      "reward": 0.4375,
+      "reward_std": 0.2359210103750229,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000433921813965,
+      "sampling/importance_sampling_ratio/min": 2.1447433027788065e-05,
+      "sampling/sampling_logp_difference/max": 10.749905586242676,
+      "sampling/sampling_logp_difference/mean": 0.018392907455563545,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 2.1441665467136772e-05,
+      "clip_ratio/high_mean": 5.360416366784193e-06,
+      "clip_ratio/low_mean": 5.504566888703266e-05,
+      "clip_ratio/low_min": 1.2581466762640048e-05,
+      "clip_ratio/region_mean": 6.040608514013002e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14713.0,
+      "completions/max_terminated_length": 14713.0,
+      "completions/mean_length": 6417.2109375,
+      "completions/mean_terminated_length": 6417.2109375,
+      "completions/min_length": 981.0,
+      "completions/min_terminated_length": 981.0,
+      "entropy": 1.0232173576951027,
+      "epoch": 0.25574977000919963,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033652919810265303,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 224375711.0,
+      "reward": 0.390625,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999820590019226,
+      "sampling/importance_sampling_ratio/min": 0.0020559614058583975,
+      "sampling/sampling_logp_difference/max": 6.18701171875,
+      "sampling/sampling_logp_difference/mean": 0.020980924367904663,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 4.679544872487895e-06,
+      "clip_ratio/high_mean": 1.1698862181219738e-06,
+      "clip_ratio/low_mean": 2.818696702888701e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9356853247008985e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15026.0,
+      "completions/max_terminated_length": 15026.0,
+      "completions/mean_length": 5275.9453125,
+      "completions/mean_terminated_length": 5275.9453125,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "entropy": 0.8563915193080902,
+      "epoch": 0.25666973321067155,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025473968125879765,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 225070992.0,
+      "reward": 0.703125,
+      "reward_std": 0.2790592610836029,
+      "rewards/accuracy_reward/mean": 0.703125,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999873042106628,
+      "sampling/importance_sampling_ratio/min": 0.0010016229934990406,
+      "sampling/sampling_logp_difference/max": 6.906133651733398,
+      "sampling/sampling_logp_difference/mean": 0.018068701028823853,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.1973035422415705e-05,
+      "clip_ratio/low_min": 6.267234766710317e-06,
+      "clip_ratio/region_mean": 4.1973035422415705e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16350.0,
+      "completions/mean_length": 7693.984375,
+      "completions/mean_terminated_length": 7556.0478515625,
+      "completions/min_length": 1349.0,
+      "completions/min_terminated_length": 1349.0,
+      "entropy": 0.7832933664321899,
+      "epoch": 0.2575896964121435,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016663498245179653,
+      "learning_rate": 1e-05,
+      "loss": 0.0836,
+      "num_tokens": 226073822.0,
+      "reward": 0.421875,
+      "reward_std": 0.3227166533470154,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999273419380188,
+      "sampling/importance_sampling_ratio/min": 5.893720299354754e-06,
+      "sampling/sampling_logp_difference/max": 12.04162311553955,
+      "sampling/sampling_logp_difference/mean": 0.01851016655564308,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 1.304801662627142e-05,
+      "clip_ratio/high_mean": 3.262004156567855e-06,
+      "clip_ratio/low_mean": 3.7096169648975774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.035817426029098e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15207.0,
+      "completions/mean_length": 6222.4609375,
+      "completions/mean_terminated_length": 6061.1669921875,
+      "completions/min_length": 967.0,
+      "completions/min_terminated_length": 967.0,
+      "entropy": 0.8835120126605034,
+      "epoch": 0.25850965961361544,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021045261528342962,
+      "learning_rate": 1e-05,
+      "loss": 0.055,
+      "num_tokens": 226888577.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999616742134094,
+      "sampling/importance_sampling_ratio/min": 5.688065698450373e-07,
+      "sampling/sampling_logp_difference/max": 14.379725456237793,
+      "sampling/sampling_logp_difference/mean": 0.018851105123758316,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.1754828114571865e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1754828114571865e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16331.0,
+      "completions/mean_length": 6371.625,
+      "completions/mean_terminated_length": 6131.328125,
+      "completions/min_length": 1034.0,
+      "completions/min_terminated_length": 1034.0,
+      "entropy": 0.9026313945651054,
+      "epoch": 0.2594296228150874,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030448357574641705,
+      "learning_rate": 1e-05,
+      "loss": 0.1009,
+      "num_tokens": 227722025.0,
+      "reward": 0.515625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999712705612183,
+      "sampling/importance_sampling_ratio/min": 0.00016869648243300617,
+      "sampling/sampling_logp_difference/max": 8.687409400939941,
+      "sampling/sampling_logp_difference/mean": 0.018757576122879982,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 7.024085562079563e-06,
+      "clip_ratio/high_mean": 1.7560213905198907e-06,
+      "clip_ratio/low_mean": 3.379111592494155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5547137599678535e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15891.0,
+      "completions/mean_length": 7510.4921875,
+      "completions/mean_terminated_length": 7224.25,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 1.044313833117485,
+      "epoch": 0.26034958601655933,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019295766251161695,
+      "learning_rate": 1e-05,
+      "loss": 0.0513,
+      "num_tokens": 228703256.0,
+      "reward": 0.3046875,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999972581863403,
+      "sampling/importance_sampling_ratio/min": 0.0002186153142247349,
+      "sampling/sampling_logp_difference/max": 8.428196907043457,
+      "sampling/sampling_logp_difference/mean": 0.02207346074283123,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 5.068321115686558e-06,
+      "clip_ratio/high_mean": 1.2670802789216395e-06,
+      "clip_ratio/low_mean": 3.7797102550030104e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9064182828951743e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 7594.140625,
+      "completions/mean_terminated_length": 7524.92919921875,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9706612005829811,
+      "epoch": 0.2612695492180313,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0017117204843088984,
+      "learning_rate": 1e-05,
+      "loss": 0.0748,
+      "num_tokens": 229697002.0,
+      "reward": 0.2734375,
+      "reward_std": 0.18649455904960632,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000016212463379,
+      "sampling/importance_sampling_ratio/min": 0.00035400164779275656,
+      "sampling/sampling_logp_difference/max": 7.946208953857422,
+      "sampling/sampling_logp_difference/mean": 0.021097885444760323,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 1.5618601537426002e-05,
+      "clip_ratio/high_mean": 3.904650384356501e-06,
+      "clip_ratio/low_mean": 4.570582996166195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.961048034601845e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15562.0,
+      "completions/mean_length": 6888.9140625,
+      "completions/mean_terminated_length": 6738.19873046875,
+      "completions/min_length": 327.0,
+      "completions/min_terminated_length": 327.0,
+      "entropy": 0.9210037142038345,
+      "epoch": 0.2621895124195032,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025933689903467894,
+      "learning_rate": 1e-05,
+      "loss": 0.0887,
+      "num_tokens": 230598679.0,
+      "reward": 0.4375,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.0007308972999453545,
+      "sampling/sampling_logp_difference/max": 7.221237659454346,
+      "sampling/sampling_logp_difference/mean": 0.01939917542040348,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 2.398964193162101e-05,
+      "clip_ratio/high_mean": 6.9283565835576155e-06,
+      "clip_ratio/low_mean": 4.821338916372042e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.514174608833855e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15458.0,
+      "completions/mean_length": 6433.640625,
+      "completions/mean_terminated_length": 6355.29150390625,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "entropy": 1.064419962465763,
+      "epoch": 0.26310947562097514,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0019397985888645053,
+      "learning_rate": 1e-05,
+      "loss": 0.0841,
+      "num_tokens": 231440153.0,
+      "reward": 0.375,
+      "reward_std": 0.3451131582260132,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999503493309021,
+      "sampling/importance_sampling_ratio/min": 0.019039930775761604,
+      "sampling/sampling_logp_difference/max": 3.961216926574707,
+      "sampling/sampling_logp_difference/mean": 0.021084938198328018,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 1.9223051822336856e-05,
+      "clip_ratio/high_mean": 6.997284344834043e-06,
+      "clip_ratio/low_mean": 5.4512621773028513e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.150990611786256e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14304.0,
+      "completions/mean_length": 5858.40625,
+      "completions/mean_terminated_length": 5691.33349609375,
+      "completions/min_length": 546.0,
+      "completions/min_terminated_length": 546.0,
+      "entropy": 0.8120778575539589,
+      "epoch": 0.2640294388224471,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002288782736286521,
+      "learning_rate": 1e-05,
+      "loss": 0.0408,
+      "num_tokens": 232209485.0,
+      "reward": 0.46875,
+      "reward_std": 0.36637401580810547,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999239444732666,
+      "sampling/importance_sampling_ratio/min": 0.00017959839897230268,
+      "sampling/sampling_logp_difference/max": 8.624787330627441,
+      "sampling/sampling_logp_difference/mean": 0.019076552242040634,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 9.900939403451048e-06,
+      "clip_ratio/high_mean": 3.4680233511608094e-06,
+      "clip_ratio/low_mean": 1.8137742017643177e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1605765368803986e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 7088.4765625,
+      "completions/mean_terminated_length": 6710.609375,
+      "completions/min_length": 688.0,
+      "completions/min_terminated_length": 688.0,
+      "entropy": 0.9231890514492989,
+      "epoch": 0.26494940202391903,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.001075367210432887,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 233133850.0,
+      "reward": 0.5078125,
+      "reward_std": 0.18383610248565674,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998681545257568,
+      "sampling/importance_sampling_ratio/min": 0.005257915705442429,
+      "sampling/sampling_logp_difference/max": 5.248020648956299,
+      "sampling/sampling_logp_difference/mean": 0.019140273332595825,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 8.648456969240215e-06,
+      "clip_ratio/high_mean": 2.1621142423100537e-06,
+      "clip_ratio/low_mean": 1.838804723774956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0550161480059614e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16384.0,
+      "completions/mean_length": 6151.78125,
+      "completions/mean_terminated_length": 5906.20849609375,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.8585417941212654,
+      "epoch": 0.265869365225391,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0015517349820584059,
+      "learning_rate": 1e-05,
+      "loss": 0.0828,
+      "num_tokens": 233940718.0,
+      "reward": 0.46875,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000255107879639,
+      "sampling/importance_sampling_ratio/min": 7.617311348440126e-05,
+      "sampling/sampling_logp_difference/max": 9.482501983642578,
+      "sampling/sampling_logp_difference/mean": 0.019276250153779984,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 1.1416668485253467e-05,
+      "clip_ratio/high_mean": 3.7661499732166703e-06,
+      "clip_ratio/low_mean": 2.1342358195397537e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5108507770710276e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15133.0,
+      "completions/mean_length": 7111.2578125,
+      "completions/mean_terminated_length": 6812.13671875,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "entropy": 0.9735362678766251,
+      "epoch": 0.2667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0036829947493970394,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 234872111.0,
+      "reward": 0.4296875,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999943971633911,
+      "sampling/importance_sampling_ratio/min": 0.0006535807042382658,
+      "sampling/sampling_logp_difference/max": 7.333044528961182,
+      "sampling/sampling_logp_difference/mean": 0.021356046199798584,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 2.2526005068357335e-05,
+      "clip_ratio/high_mean": 5.631501267089334e-06,
+      "clip_ratio/low_mean": 3.30086276107977e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.864012808207917e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15995.0,
+      "completions/mean_length": 6787.671875,
+      "completions/mean_terminated_length": 6478.11279296875,
+      "completions/min_length": 1404.0,
+      "completions/min_terminated_length": 1404.0,
+      "entropy": 0.8856986835598946,
+      "epoch": 0.26770929162833484,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00234629912301898,
+      "learning_rate": 1e-05,
+      "loss": 0.0169,
+      "num_tokens": 235759149.0,
+      "reward": 0.5390625,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999296069145203,
+      "sampling/importance_sampling_ratio/min": 0.00035710117663256824,
+      "sampling/sampling_logp_difference/max": 7.937491416931152,
+      "sampling/sampling_logp_difference/mean": 0.01950475014746189,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 2.6025282068076194e-05,
+      "clip_ratio/high_mean": 6.5063205170190486e-06,
+      "clip_ratio/low_mean": 4.603358706845029e-05,
+      "clip_ratio/low_min": 4.53654638477019e-06,
+      "clip_ratio/region_mean": 5.253990843812062e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15845.0,
+      "completions/mean_length": 6757.203125,
+      "completions/mean_terminated_length": 6604.39697265625,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "entropy": 0.9217840805649757,
+      "epoch": 0.2686292548298068,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034125701058655977,
+      "learning_rate": 1e-05,
+      "loss": 0.0527,
+      "num_tokens": 236643319.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2896084189414978,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 6.144329745438881e-06,
+      "sampling/sampling_logp_difference/max": 11.999980926513672,
+      "sampling/sampling_logp_difference/mean": 0.020774487406015396,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.5210429246035346e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5210429246035346e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 6504.4375,
+      "completions/mean_terminated_length": 6185.74169921875,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "entropy": 1.126970261335373,
+      "epoch": 0.26954921803127874,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020905097480863333,
+      "learning_rate": 1e-05,
+      "loss": 0.0464,
+      "num_tokens": 237495351.0,
+      "reward": 0.25,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000053644180298,
+      "sampling/importance_sampling_ratio/min": 0.0009940610034391284,
+      "sampling/sampling_logp_difference/max": 6.913712024688721,
+      "sampling/sampling_logp_difference/mean": 0.023218728601932526,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.5693222053414502e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.5693222053414502e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15888.0,
+      "completions/mean_length": 5702.4140625,
+      "completions/mean_terminated_length": 5446.05615234375,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8803137242794037,
+      "epoch": 0.2704691812327507,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002073790645226836,
+      "learning_rate": 1e-05,
+      "loss": 0.0066,
+      "num_tokens": 238251852.0,
+      "reward": 0.5625,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000054955482483,
+      "sampling/importance_sampling_ratio/min": 0.016290459781885147,
+      "sampling/sampling_logp_difference/max": 4.117175579071045,
+      "sampling/sampling_logp_difference/mean": 0.0185186006128788,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 1.4213665508577833e-05,
+      "clip_ratio/high_mean": 4.4483959982244414e-06,
+      "clip_ratio/low_mean": 2.979715202400257e-05,
+      "clip_ratio/low_min": 4.1597336348786484e-06,
+      "clip_ratio/region_mean": 3.424554824960069e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 7176.2890625,
+      "completions/mean_terminated_length": 6801.99169921875,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 0.9554997384548187,
+      "epoch": 0.27138914443422263,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002781527815386653,
+      "learning_rate": 1e-05,
+      "loss": 0.0908,
+      "num_tokens": 239189385.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3634958863258362,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999794960021973,
+      "sampling/importance_sampling_ratio/min": 0.0018711343873292208,
+      "sampling/sampling_logp_difference/max": 6.281210422515869,
+      "sampling/sampling_logp_difference/mean": 0.020436719059944153,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 1.2612186310434481e-05,
+      "clip_ratio/high_mean": 5.171368570699997e-06,
+      "clip_ratio/low_mean": 4.8968343890010146e-05,
+      "clip_ratio/low_min": 4.0222671486844774e-06,
+      "clip_ratio/region_mean": 5.413971166490228e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16020.0,
+      "completions/mean_length": 7855.578125,
+      "completions/mean_terminated_length": 7651.2001953125,
+      "completions/min_length": 688.0,
+      "completions/min_terminated_length": 688.0,
+      "entropy": 0.9450526610016823,
+      "epoch": 0.27230910763569455,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003132987068966031,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 240217715.0,
+      "reward": 0.40625,
+      "reward_std": 0.28512775897979736,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999253153800964,
+      "sampling/importance_sampling_ratio/min": 0.0011438478250056505,
+      "sampling/sampling_logp_difference/max": 6.773357391357422,
+      "sampling/sampling_logp_difference/mean": 0.021461743861436844,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 2.172341964978841e-05,
+      "clip_ratio/high_mean": 6.823271291978017e-06,
+      "clip_ratio/low_mean": 3.516899266742257e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.199226441414794e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14060.0,
+      "completions/mean_length": 6240.265625,
+      "completions/mean_terminated_length": 5913.04833984375,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.8811023011803627,
+      "epoch": 0.2732290708371665,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028523094952106476,
+      "learning_rate": 1e-05,
+      "loss": 0.015,
+      "num_tokens": 241035133.0,
+      "reward": 0.484375,
+      "reward_std": 0.26143303513526917,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000007152557373,
+      "sampling/importance_sampling_ratio/min": 0.0006931954412721097,
+      "sampling/sampling_logp_difference/max": 7.274198532104492,
+      "sampling/sampling_logp_difference/mean": 0.019493088126182556,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 1.2606601558218244e-05,
+      "clip_ratio/high_mean": 3.151650389554561e-06,
+      "clip_ratio/low_mean": 3.768150395444536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.08331545713736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15795.0,
+      "completions/mean_length": 6103.203125,
+      "completions/mean_terminated_length": 6022.251953125,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.8766692876815796,
+      "epoch": 0.27414903403863844,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026241440791636705,
+      "learning_rate": 1e-05,
+      "loss": 0.0089,
+      "num_tokens": 241836479.0,
+      "reward": 0.453125,
+      "reward_std": 0.32589423656463623,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925434589386,
+      "sampling/importance_sampling_ratio/min": 0.00012664205860346556,
+      "sampling/sampling_logp_difference/max": 8.974145889282227,
+      "sampling/sampling_logp_difference/mean": 0.01907728984951973,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 1.7400974911652156e-05,
+      "clip_ratio/high_mean": 4.350243727913039e-06,
+      "clip_ratio/low_mean": 4.527119426711579e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962143839293276e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 7711.0703125,
+      "completions/mean_terminated_length": 7573.4052734375,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 1.0770929008722305,
+      "epoch": 0.2750689972401104,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003654222236946225,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 242844376.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999839067459106,
+      "sampling/importance_sampling_ratio/min": 0.0006267472635954618,
+      "sampling/sampling_logp_difference/max": 7.374967098236084,
+      "sampling/sampling_logp_difference/mean": 0.022012868896126747,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 1.4325163647299632e-05,
+      "clip_ratio/high_mean": 3.581290911824908e-06,
+      "clip_ratio/low_mean": 4.28195745598714e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6400865016948956e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15905.0,
+      "completions/mean_length": 6616.5546875,
+      "completions/mean_terminated_length": 6539.6455078125,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "entropy": 0.8439916148781776,
+      "epoch": 0.27598896044158233,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029195898678153753,
+      "learning_rate": 1e-05,
+      "loss": 0.1094,
+      "num_tokens": 243708479.0,
+      "reward": 0.453125,
+      "reward_std": 0.3516485095024109,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 2.189194128732197e-05,
+      "sampling/sampling_logp_difference/max": 10.729392051696777,
+      "sampling/sampling_logp_difference/mean": 0.017992788925766945,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 1.848296233220026e-05,
+      "clip_ratio/high_mean": 4.620740583050065e-06,
+      "clip_ratio/low_mean": 5.01860952226707e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.480683557834709e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15286.0,
+      "completions/mean_length": 6173.5234375,
+      "completions/mean_terminated_length": 6093.1259765625,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "entropy": 0.8975192531943321,
+      "epoch": 0.2769089236430543,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0017261393368244171,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 244515378.0,
+      "reward": 0.53125,
+      "reward_std": 0.3532412052154541,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999533891677856,
+      "sampling/importance_sampling_ratio/min": 0.000553854217287153,
+      "sampling/sampling_logp_difference/max": 7.4986090660095215,
+      "sampling/sampling_logp_difference/mean": 0.019458644092082977,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 4.114005332667148e-05,
+      "clip_ratio/high_mean": 1.2276760230633954e-05,
+      "clip_ratio/low_mean": 3.397437080820964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.625113024303573e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16032.0,
+      "completions/mean_length": 5640.90625,
+      "completions/mean_terminated_length": 5470.38134765625,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "entropy": 0.8833519890904427,
+      "epoch": 0.2778288868445262,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018768958980217576,
+      "learning_rate": 1e-05,
+      "loss": 0.0731,
+      "num_tokens": 245258318.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3135277330875397,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999516606330872,
+      "sampling/importance_sampling_ratio/min": 0.0071789538487792015,
+      "sampling/sampling_logp_difference/max": 4.936601638793945,
+      "sampling/sampling_logp_difference/mean": 0.019646335393190384,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 1.4196921938491869e-05,
+      "clip_ratio/high_mean": 4.514302474944998e-06,
+      "clip_ratio/low_mean": 4.4677519781544106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.919182129015098e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16378.0,
+      "completions/mean_length": 7840.5078125,
+      "completions/mean_terminated_length": 7564.9111328125,
+      "completions/min_length": 758.0,
+      "completions/min_terminated_length": 758.0,
+      "entropy": 0.9772802665829659,
+      "epoch": 0.27874885004599814,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002617602702230215,
+      "learning_rate": 1e-05,
+      "loss": 0.0298,
+      "num_tokens": 246280663.0,
+      "reward": 0.328125,
+      "reward_std": 0.29826050996780396,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0008982301224023104,
+      "sampling/sampling_logp_difference/max": 7.015084266662598,
+      "sampling/sampling_logp_difference/mean": 0.022171074524521828,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7621316146687604e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7621316146687604e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16326.0,
+      "completions/mean_length": 6316.1015625,
+      "completions/mean_terminated_length": 6074.47216796875,
+      "completions/min_length": 779.0,
+      "completions/min_terminated_length": 779.0,
+      "entropy": 0.8542795851826668,
+      "epoch": 0.2796688132474701,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0011874900665134192,
+      "learning_rate": 1e-05,
+      "loss": 0.0513,
+      "num_tokens": 247107604.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2227931022644043,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000126361846924,
+      "sampling/importance_sampling_ratio/min": 0.00015846268797758967,
+      "sampling/sampling_logp_difference/max": 8.749991416931152,
+      "sampling/sampling_logp_difference/mean": 0.018691308796405792,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 3.0959752166381804e-06,
+      "clip_ratio/high_mean": 7.739938041595451e-07,
+      "clip_ratio/low_mean": 6.0967123090449604e-05,
+      "clip_ratio/low_min": 2.711407751121442e-05,
+      "clip_ratio/region_mean": 6.17411176335736e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6568.171875,
+      "completions/mean_terminated_length": 6412.365234375,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "entropy": 0.9063890501856804,
+      "epoch": 0.28058877644894203,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002459619427099824,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 247967322.0,
+      "reward": 0.5,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998743534088135,
+      "sampling/importance_sampling_ratio/min": 0.012350871227681637,
+      "sampling/sampling_logp_difference/max": 4.394028663635254,
+      "sampling/sampling_logp_difference/mean": 0.020134467631578445,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 5.9507838159333915e-06,
+      "clip_ratio/high_mean": 1.4876959539833479e-06,
+      "clip_ratio/low_mean": 2.400908408617397e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.549678004015732e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15714.0,
+      "completions/mean_length": 8182.28125,
+      "completions/mean_terminated_length": 7635.50048828125,
+      "completions/min_length": 877.0,
+      "completions/min_terminated_length": 877.0,
+      "entropy": 1.0137704983353615,
+      "epoch": 0.281508739650414,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0016673406353220344,
+      "learning_rate": 1e-05,
+      "loss": 0.0244,
+      "num_tokens": 249031710.0,
+      "reward": 0.3359375,
+      "reward_std": 0.22225631773471832,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998995065689087,
+      "sampling/importance_sampling_ratio/min": 0.0008049134048633277,
+      "sampling/sampling_logp_difference/max": 7.1247758865356445,
+      "sampling/sampling_logp_difference/mean": 0.021704845130443573,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 1.4527202438330278e-05,
+      "clip_ratio/high_mean": 3.6318006095825695e-06,
+      "clip_ratio/low_mean": 3.1829216595724574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5461017205307144e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14502.0,
+      "completions/max_terminated_length": 14502.0,
+      "completions/mean_length": 6460.5703125,
+      "completions/mean_terminated_length": 6460.5703125,
+      "completions/min_length": 804.0,
+      "completions/min_terminated_length": 804.0,
+      "entropy": 1.0418165400624275,
+      "epoch": 0.2824287028518859,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022682021372020245,
+      "learning_rate": 1e-05,
+      "loss": 0.0171,
+      "num_tokens": 249881047.0,
+      "reward": 0.359375,
+      "reward_std": 0.25566887855529785,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999744296073914,
+      "sampling/importance_sampling_ratio/min": 0.002809183904901147,
+      "sampling/sampling_logp_difference/max": 5.874861240386963,
+      "sampling/sampling_logp_difference/mean": 0.02204791083931923,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 9.222687367582694e-06,
+      "clip_ratio/high_mean": 4.125313353142701e-06,
+      "clip_ratio/low_mean": 4.836107154915226e-05,
+      "clip_ratio/low_min": 3.4611657611094415e-06,
+      "clip_ratio/region_mean": 5.248638444754761e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14754.0,
+      "completions/mean_length": 6846.3046875,
+      "completions/mean_terminated_length": 6694.9130859375,
+      "completions/min_length": 944.0,
+      "completions/min_terminated_length": 944.0,
+      "entropy": 0.9839218333363533,
+      "epoch": 0.28334866605335784,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002436346374452114,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 250773806.0,
+      "reward": 0.484375,
+      "reward_std": 0.34299150109291077,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980628490448,
+      "sampling/importance_sampling_ratio/min": 0.0257408544421196,
+      "sampling/sampling_logp_difference/max": 3.6596758365631104,
+      "sampling/sampling_logp_difference/mean": 0.02135510742664337,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 1.3327621218195418e-05,
+      "clip_ratio/high_mean": 3.3319053045488545e-06,
+      "clip_ratio/low_mean": 3.791964286392613e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1251548054788145e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15777.0,
+      "completions/mean_length": 6558.53125,
+      "completions/mean_terminated_length": 6241.58056640625,
+      "completions/min_length": 884.0,
+      "completions/min_terminated_length": 884.0,
+      "entropy": 0.7833076938986778,
+      "epoch": 0.2842686292548298,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002015948062762618,
+      "learning_rate": 1e-05,
+      "loss": 0.0791,
+      "num_tokens": 251633074.0,
+      "reward": 0.46875,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999434947967529,
+      "sampling/importance_sampling_ratio/min": 5.1445105782477185e-05,
+      "sampling/sampling_logp_difference/max": 9.874995231628418,
+      "sampling/sampling_logp_difference/mean": 0.017078280448913574,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.3865982686620555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3865982686620555e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16162.0,
+      "completions/mean_length": 7626.390625,
+      "completions/mean_terminated_length": 7487.38134765625,
+      "completions/min_length": 1400.0,
+      "completions/min_terminated_length": 1400.0,
+      "entropy": 0.8946382254362106,
+      "epoch": 0.28518859245630174,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001098336186259985,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 252629300.0,
+      "reward": 0.3359375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000107288360596,
+      "sampling/importance_sampling_ratio/min": 0.00021643216314259917,
+      "sampling/sampling_logp_difference/max": 8.438233375549316,
+      "sampling/sampling_logp_difference/mean": 0.01972624473273754,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 6.5777783220255515e-06,
+      "clip_ratio/high_mean": 1.6444445805063879e-06,
+      "clip_ratio/low_mean": 1.7658890669736138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9303335250242526e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15763.0,
+      "completions/mean_length": 5796.984375,
+      "completions/mean_terminated_length": 5713.6220703125,
+      "completions/min_length": 528.0,
+      "completions/min_terminated_length": 528.0,
+      "entropy": 0.969724528491497,
+      "epoch": 0.2861085556577737,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003871417138725519,
+      "learning_rate": 1e-05,
+      "loss": 0.0408,
+      "num_tokens": 253389562.0,
+      "reward": 0.484375,
+      "reward_std": 0.23752351105213165,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998880624771118,
+      "sampling/importance_sampling_ratio/min": 2.4301782104885206e-05,
+      "sampling/sampling_logp_difference/max": 10.624960899353027,
+      "sampling/sampling_logp_difference/mean": 0.019220752641558647,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 8.099077376755304e-06,
+      "clip_ratio/high_mean": 2.8300572125772305e-06,
+      "clip_ratio/low_mean": 3.2033483023496956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.486354006554393e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15769.0,
+      "completions/mean_length": 6938.5625,
+      "completions/mean_terminated_length": 6788.63525390625,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.9812447279691696,
+      "epoch": 0.28702851885924563,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002257548039779067,
+      "learning_rate": 1e-05,
+      "loss": -0.0089,
+      "num_tokens": 254295858.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2596206068992615,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000464916229248,
+      "sampling/importance_sampling_ratio/min": 0.0009388317703269422,
+      "sampling/sampling_logp_difference/max": 6.970874309539795,
+      "sampling/sampling_logp_difference/mean": 0.02080199122428894,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 4.441917553776875e-06,
+      "clip_ratio/high_mean": 1.1104793884442188e-06,
+      "clip_ratio/low_mean": 3.414505465570983e-05,
+      "clip_ratio/low_min": 3.790060873143375e-06,
+      "clip_ratio/region_mean": 3.5255534044154047e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15163.0,
+      "completions/mean_length": 6878.15625,
+      "completions/mean_terminated_length": 6650.01611328125,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.9106859937310219,
+      "epoch": 0.28794848206071755,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00420041661709547,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 255197110.0,
+      "reward": 0.421875,
+      "reward_std": 0.30433881282806396,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999925494194031,
+      "sampling/importance_sampling_ratio/min": 0.015217061154544353,
+      "sampling/sampling_logp_difference/max": 4.185338020324707,
+      "sampling/sampling_logp_difference/mean": 0.02016574889421463,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 8.814751254249131e-06,
+      "clip_ratio/high_mean": 2.203687813562283e-06,
+      "clip_ratio/low_mean": 3.137724206681014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3580929766685585e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 6260.2578125,
+      "completions/mean_terminated_length": 6260.2578125,
+      "completions/min_length": 790.0,
+      "completions/min_terminated_length": 790.0,
+      "entropy": 0.9523455575108528,
+      "epoch": 0.2888684452621895,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027907798066735268,
+      "learning_rate": 1e-05,
+      "loss": 0.0302,
+      "num_tokens": 256018935.0,
+      "reward": 0.421875,
+      "reward_std": 0.2659186124801636,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000364780426025,
+      "sampling/importance_sampling_ratio/min": 7.485197420464829e-05,
+      "sampling/sampling_logp_difference/max": 9.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.0191945917904377,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 2.8685263259831117e-05,
+      "clip_ratio/high_mean": 7.171315814957779e-06,
+      "clip_ratio/low_mean": 2.780131131885355e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.497262770224552e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16124.0,
+      "completions/mean_length": 6202.828125,
+      "completions/mean_terminated_length": 6041.22265625,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.8513326346874237,
+      "epoch": 0.28978840846366144,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023744129575788975,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 256841129.0,
+      "reward": 0.5625,
+      "reward_std": 0.32407689094543457,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000146627426147,
+      "sampling/importance_sampling_ratio/min": 9.269781003240496e-06,
+      "sampling/sampling_logp_difference/max": 11.588750839233398,
+      "sampling/sampling_logp_difference/mean": 0.019519174471497536,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 1.6381697605538648e-05,
+      "clip_ratio/high_mean": 4.095424401384662e-06,
+      "clip_ratio/low_mean": 3.0394592840821133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.449001792432682e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16328.0,
+      "completions/mean_length": 8019.4609375,
+      "completions/mean_terminated_length": 7073.90380859375,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "entropy": 0.9211000874638557,
+      "epoch": 0.2907083716651334,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024705040268599987,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 257884188.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999315738677979,
+      "sampling/importance_sampling_ratio/min": 0.016358470544219017,
+      "sampling/sampling_logp_difference/max": 4.113009452819824,
+      "sampling/sampling_logp_difference/mean": 0.01984308287501335,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 7.485402420570608e-06,
+      "clip_ratio/high_mean": 1.871350605142652e-06,
+      "clip_ratio/low_mean": 3.025547425750119e-05,
+      "clip_ratio/low_min": 2.697337095014518e-06,
+      "clip_ratio/region_mean": 3.212682509001752e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15210.0,
+      "completions/mean_length": 7257.6875,
+      "completions/mean_terminated_length": 7038.65625,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 0.8801277950406075,
+      "epoch": 0.29162833486660533,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032848953269422054,
+      "learning_rate": 1e-05,
+      "loss": 0.0305,
+      "num_tokens": 258831852.0,
+      "reward": 0.4296875,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986124992371,
+      "sampling/importance_sampling_ratio/min": 0.00019848966621793807,
+      "sampling/sampling_logp_difference/max": 8.524773597717285,
+      "sampling/sampling_logp_difference/mean": 0.019743187353014946,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 1.52771035573096e-05,
+      "clip_ratio/high_mean": 3.8192758893274e-06,
+      "clip_ratio/low_mean": 3.605492440783564e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.987420052453672e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14903.0,
+      "completions/mean_length": 6042.84375,
+      "completions/mean_terminated_length": 5878.69873046875,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.8792382404208183,
+      "epoch": 0.29254829806807725,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004201764706522226,
+      "learning_rate": 1e-05,
+      "loss": 0.099,
+      "num_tokens": 259623512.0,
+      "reward": 0.640625,
+      "reward_std": 0.3913668990135193,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998612403869629,
+      "sampling/importance_sampling_ratio/min": 0.00027811730979010463,
+      "sampling/sampling_logp_difference/max": 8.187467575073242,
+      "sampling/sampling_logp_difference/mean": 0.018901977688074112,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.1642084397608414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1642084397608414e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16381.0,
+      "completions/mean_length": 7667.6875,
+      "completions/mean_terminated_length": 7458.49658203125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9096411988139153,
+      "epoch": 0.2934682612695492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014557713875547051,
+      "learning_rate": 1e-05,
+      "loss": 0.0383,
+      "num_tokens": 260623928.0,
+      "reward": 0.3515625,
+      "reward_std": 0.22726887464523315,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999264478683472,
+      "sampling/importance_sampling_ratio/min": 0.0002615761768538505,
+      "sampling/sampling_logp_difference/max": 8.248785018920898,
+      "sampling/sampling_logp_difference/mean": 0.01979639381170273,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 2.36019068324822e-05,
+      "clip_ratio/high_mean": 5.90047670812055e-06,
+      "clip_ratio/low_mean": 2.704614530557592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2946622809504333e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15514.0,
+      "completions/max_terminated_length": 15514.0,
+      "completions/mean_length": 6428.8203125,
+      "completions/mean_terminated_length": 6428.8203125,
+      "completions/min_length": 617.0,
+      "completions/min_terminated_length": 617.0,
+      "entropy": 0.9974069148302078,
+      "epoch": 0.29438822447102114,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028210312593728304,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 261465625.0,
+      "reward": 0.46875,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000195503234863,
+      "sampling/importance_sampling_ratio/min": 0.001225265790708363,
+      "sampling/sampling_logp_difference/max": 6.704597473144531,
+      "sampling/sampling_logp_difference/mean": 0.021066997200250626,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 2.9634452857862925e-05,
+      "clip_ratio/high_mean": 7.408613214465731e-06,
+      "clip_ratio/low_mean": 3.7066520235384814e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.447513333616371e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15224.0,
+      "completions/mean_length": 5664.515625,
+      "completions/mean_terminated_length": 5580.1103515625,
+      "completions/min_length": 299.0,
+      "completions/min_terminated_length": 299.0,
+      "entropy": 0.9557281509041786,
+      "epoch": 0.2953081876724931,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024263609666377306,
+      "learning_rate": 1e-05,
+      "loss": 0.0357,
+      "num_tokens": 262208475.0,
+      "reward": 0.4765625,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998950958251953,
+      "sampling/importance_sampling_ratio/min": 0.0001059407222783193,
+      "sampling/sampling_logp_difference/max": 9.152630805969238,
+      "sampling/sampling_logp_difference/mean": 0.01997508481144905,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 1.9527269159880234e-05,
+      "clip_ratio/high_mean": 5.685056066795369e-06,
+      "clip_ratio/low_mean": 4.980480150607036e-05,
+      "clip_ratio/low_min": 5.136423624207964e-06,
+      "clip_ratio/region_mean": 5.5489856435997353e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15566.0,
+      "completions/mean_length": 6649.5390625,
+      "completions/mean_terminated_length": 6170.794921875,
+      "completions/min_length": 599.0,
+      "completions/min_terminated_length": 599.0,
+      "entropy": 0.9003193452954292,
+      "epoch": 0.29622815087396503,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025556792970746756,
+      "learning_rate": 1e-05,
+      "loss": 0.0366,
+      "num_tokens": 263078672.0,
+      "reward": 0.453125,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998431205749512,
+      "sampling/importance_sampling_ratio/min": 3.631301660789177e-05,
+      "sampling/sampling_logp_difference/max": 10.223334312438965,
+      "sampling/sampling_logp_difference/mean": 0.019613387063145638,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.1492368912513484e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.1492368912513484e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15190.0,
+      "completions/mean_length": 5819.4140625,
+      "completions/mean_terminated_length": 5478.62060546875,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9234923645853996,
+      "epoch": 0.297148114075437,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0008845282136462629,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 263843797.0,
+      "reward": 0.5390625,
+      "reward_std": 0.14913026988506317,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452233314514,
+      "sampling/importance_sampling_ratio/min": 0.06759586930274963,
+      "sampling/sampling_logp_difference/max": 2.6942083835601807,
+      "sampling/sampling_logp_difference/mean": 0.02007308602333069,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 1.1687909363899962e-05,
+      "clip_ratio/high_mean": 2.9219773409749905e-06,
+      "clip_ratio/low_mean": 2.420720869622528e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7129186207730527e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16033.0,
+      "completions/mean_length": 6952.96875,
+      "completions/mean_terminated_length": 6726.62451171875,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "entropy": 0.8909401148557663,
+      "epoch": 0.2980680772769089,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001527746208012104,
+      "learning_rate": 1e-05,
+      "loss": 0.0633,
+      "num_tokens": 264751769.0,
+      "reward": 0.453125,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999520778656006,
+      "sampling/importance_sampling_ratio/min": 0.000505264790263027,
+      "sampling/sampling_logp_difference/max": 7.590427875518799,
+      "sampling/sampling_logp_difference/mean": 0.019622590392827988,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 1.5079081094881985e-05,
+      "clip_ratio/high_mean": 4.600909505825257e-06,
+      "clip_ratio/low_mean": 5.333864191925386e-05,
+      "clip_ratio/low_min": 5.043169494456379e-06,
+      "clip_ratio/region_mean": 5.793955187982647e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15667.0,
+      "completions/mean_length": 8138.5234375,
+      "completions/mean_terminated_length": 7733.0078125,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "entropy": 0.972789965569973,
+      "epoch": 0.29898804047838085,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003113618353381753,
+      "learning_rate": 1e-05,
+      "loss": 0.0771,
+      "num_tokens": 265810580.0,
+      "reward": 0.40625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998828172683716,
+      "sampling/importance_sampling_ratio/min": 9.312124404914357e-08,
+      "sampling/sampling_logp_difference/max": 16.189363479614258,
+      "sampling/sampling_logp_difference/mean": 0.02168515883386135,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 4.463807272259146e-06,
+      "clip_ratio/high_mean": 1.1159518180647865e-06,
+      "clip_ratio/low_mean": 3.45970811395091e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571303295757389e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16248.0,
+      "completions/mean_length": 7565.6015625,
+      "completions/mean_terminated_length": 7131.90966796875,
+      "completions/min_length": 1017.0,
+      "completions/min_terminated_length": 1017.0,
+      "entropy": 0.835600845515728,
+      "epoch": 0.2999080036798528,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0009589543915353715,
+      "learning_rate": 1e-05,
+      "loss": 0.0509,
+      "num_tokens": 266796097.0,
+      "reward": 0.5078125,
+      "reward_std": 0.16834920644760132,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0017039220547303557,
+      "sampling/sampling_logp_difference/max": 6.374822616577148,
+      "sampling/sampling_logp_difference/mean": 0.01885361596941948,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 2.260646033391822e-05,
+      "clip_ratio/high_mean": 5.651615083479555e-06,
+      "clip_ratio/low_mean": 5.806843591926736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.372005145749426e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 7124.0546875,
+      "completions/mean_terminated_length": 6668.64697265625,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "entropy": 0.9041655585169792,
+      "epoch": 0.30082796688132474,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0024741124361753464,
+      "learning_rate": 1e-05,
+      "loss": 0.0514,
+      "num_tokens": 267727528.0,
+      "reward": 0.4296875,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999866247177124,
+      "sampling/importance_sampling_ratio/min": 4.63160322397016e-05,
+      "sampling/sampling_logp_difference/max": 9.980022430419922,
+      "sampling/sampling_logp_difference/mean": 0.01998118683695793,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 1.7461054540035548e-05,
+      "clip_ratio/high_mean": 5.456775966194982e-06,
+      "clip_ratio/low_mean": 3.374219397755951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.919897017112817e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14834.0,
+      "completions/mean_length": 6757.65625,
+      "completions/mean_terminated_length": 6681.8583984375,
+      "completions/min_length": 1123.0,
+      "completions/min_terminated_length": 1123.0,
+      "entropy": 1.105302907526493,
+      "epoch": 0.3017479300827967,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002233455190435052,
+      "learning_rate": 1e-05,
+      "loss": 0.0147,
+      "num_tokens": 268610868.0,
+      "reward": 0.375,
+      "reward_std": 0.23857943713665009,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549984931946,
+      "sampling/importance_sampling_ratio/min": 3.3169128528243164e-06,
+      "sampling/sampling_logp_difference/max": 12.616476058959961,
+      "sampling/sampling_logp_difference/mean": 0.021600255742669106,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 1.7514204046165105e-05,
+      "clip_ratio/high_mean": 4.378551011541276e-06,
+      "clip_ratio/low_mean": 4.300070588669769e-05,
+      "clip_ratio/low_min": 3.6705330330732977e-06,
+      "clip_ratio/region_mean": 4.7379256784552126e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16147.0,
+      "completions/mean_length": 7522.5546875,
+      "completions/mean_terminated_length": 7381.8974609375,
+      "completions/min_length": 1390.0,
+      "completions/min_terminated_length": 1390.0,
+      "entropy": 1.0577925741672516,
+      "epoch": 0.30266789328426863,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017964976141229272,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 269594867.0,
+      "reward": 0.421875,
+      "reward_std": 0.28223684430122375,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999937891960144,
+      "sampling/importance_sampling_ratio/min": 0.002247168216854334,
+      "sampling/sampling_logp_difference/max": 6.098084449768066,
+      "sampling/sampling_logp_difference/mean": 0.021326296031475067,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 1.7011016097967513e-05,
+      "clip_ratio/high_mean": 4.252754024491878e-06,
+      "clip_ratio/low_mean": 2.5991578013417893e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0244332265283447e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 6232.109375,
+      "completions/mean_terminated_length": 5904.62890625,
+      "completions/min_length": 1238.0,
+      "completions/min_terminated_length": 1238.0,
+      "entropy": 0.8473618850111961,
+      "epoch": 0.30358785648574055,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023369218688458204,
+      "learning_rate": 1e-05,
+      "loss": 0.0291,
+      "num_tokens": 270410785.0,
+      "reward": 0.6015625,
+      "reward_std": 0.23516449332237244,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000063180923462,
+      "sampling/importance_sampling_ratio/min": 0.00010575528722256422,
+      "sampling/sampling_logp_difference/max": 9.154382705688477,
+      "sampling/sampling_logp_difference/mean": 0.018453873693943024,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 1.2072427125531249e-05,
+      "clip_ratio/high_mean": 4.300789669287042e-06,
+      "clip_ratio/low_mean": 3.064826853460545e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4949058090205654e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14454.0,
+      "completions/max_terminated_length": 14454.0,
+      "completions/mean_length": 5847.0625,
+      "completions/mean_terminated_length": 5847.0625,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "entropy": 0.8186105340719223,
+      "epoch": 0.3045078196872125,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014558705734089017,
+      "learning_rate": 1e-05,
+      "loss": 0.0672,
+      "num_tokens": 271179113.0,
+      "reward": 0.5390625,
+      "reward_std": 0.22673210501670837,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000067114830017,
+      "sampling/importance_sampling_ratio/min": 1.994453305087518e-05,
+      "sampling/sampling_logp_difference/max": 10.822555541992188,
+      "sampling/sampling_logp_difference/mean": 0.017629161477088928,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 3.204624090358266e-05,
+      "clip_ratio/high_mean": 8.719567063053546e-06,
+      "clip_ratio/low_mean": 5.131868192620459e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.0038249102944974e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16065.0,
+      "completions/mean_length": 6670.6015625,
+      "completions/mean_terminated_length": 6516.4208984375,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9379853457212448,
+      "epoch": 0.30542778288868444,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002496426459401846,
+      "learning_rate": 1e-05,
+      "loss": 0.051,
+      "num_tokens": 272054510.0,
+      "reward": 0.328125,
+      "reward_std": 0.29932624101638794,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998705387115479,
+      "sampling/importance_sampling_ratio/min": 0.00010894420120166615,
+      "sampling/sampling_logp_difference/max": 9.124674797058105,
+      "sampling/sampling_logp_difference/mean": 0.020175442099571228,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 1.1311959497106727e-05,
+      "clip_ratio/high_mean": 2.827989874276682e-06,
+      "clip_ratio/low_mean": 6.672416202491149e-05,
+      "clip_ratio/low_min": 4.344501576269977e-06,
+      "clip_ratio/region_mean": 6.955215212656185e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 6613.328125,
+      "completions/mean_terminated_length": 6613.328125,
+      "completions/min_length": 439.0,
+      "completions/min_terminated_length": 439.0,
+      "entropy": 1.0781218782067299,
+      "epoch": 0.3063477460901564,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028466631192713976,
+      "learning_rate": 1e-05,
+      "loss": 0.0257,
+      "num_tokens": 272920304.0,
+      "reward": 0.3359375,
+      "reward_std": 0.32089439034461975,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999476671218872,
+      "sampling/importance_sampling_ratio/min": 0.02985518053174019,
+      "sampling/sampling_logp_difference/max": 3.511396884918213,
+      "sampling/sampling_logp_difference/mean": 0.02250460349023342,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.3429964585375274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3429964585375274e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6632.78125,
+      "completions/mean_terminated_length": 6318.2255859375,
+      "completions/min_length": 888.0,
+      "completions/min_terminated_length": 888.0,
+      "entropy": 0.9595735669136047,
+      "epoch": 0.30726770929162833,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0027409526519477367,
+      "learning_rate": 1e-05,
+      "loss": 0.0564,
+      "num_tokens": 273789588.0,
+      "reward": 0.3671875,
+      "reward_std": 0.12863078713417053,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999409914016724,
+      "sampling/importance_sampling_ratio/min": 8.484355930704623e-05,
+      "sampling/sampling_logp_difference/max": 9.374701499938965,
+      "sampling/sampling_logp_difference/mean": 0.02000725269317627,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 1.0485138318472309e-05,
+      "clip_ratio/high_mean": 2.6212845796180773e-06,
+      "clip_ratio/low_mean": 6.270217818382662e-05,
+      "clip_ratio/low_min": 1.282997527596308e-05,
+      "clip_ratio/region_mean": 6.532346287713153e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15567.0,
+      "completions/mean_length": 8083.421875,
+      "completions/mean_terminated_length": 7884.20849609375,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.139024168252945,
+      "epoch": 0.30818767249310025,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001853835303336382,
+      "learning_rate": 1e-05,
+      "loss": 0.0521,
+      "num_tokens": 274843754.0,
+      "reward": 0.2734375,
+      "reward_std": 0.29719969630241394,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961256980896,
+      "sampling/importance_sampling_ratio/min": 6.099340225773631e-06,
+      "sampling/sampling_logp_difference/max": 12.007329940795898,
+      "sampling/sampling_logp_difference/mean": 0.023757295683026314,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 6.558237146236934e-06,
+      "clip_ratio/high_mean": 1.6395592865592334e-06,
+      "clip_ratio/low_mean": 3.2649955073793535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.428951481510012e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16058.0,
+      "completions/max_terminated_length": 16058.0,
+      "completions/mean_length": 6932.6640625,
+      "completions/mean_terminated_length": 6932.6640625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.2969390451908112,
+      "epoch": 0.3091076356945722,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002049664966762066,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 275750023.0,
+      "reward": 0.21875,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000157356262207,
+      "sampling/importance_sampling_ratio/min": 5.287989188218489e-05,
+      "sampling/sampling_logp_difference/max": 9.847487449645996,
+      "sampling/sampling_logp_difference/mean": 0.021840902045369148,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 5.1826359594997484e-06,
+      "clip_ratio/high_mean": 1.2956589898749371e-06,
+      "clip_ratio/low_mean": 3.607215444390022e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.736781377483567e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15213.0,
+      "completions/mean_length": 7630.65625,
+      "completions/mean_terminated_length": 7124.26416015625,
+      "completions/min_length": 1002.0,
+      "completions/min_terminated_length": 1002.0,
+      "entropy": 0.959126852452755,
+      "epoch": 0.31002759889604414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030745298136025667,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 276750011.0,
+      "reward": 0.3125,
+      "reward_std": 0.30091896653175354,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999728798866272,
+      "sampling/importance_sampling_ratio/min": 3.149233089061454e-05,
+      "sampling/sampling_logp_difference/max": 10.365766525268555,
+      "sampling/sampling_logp_difference/mean": 0.021394159644842148,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 6.921764679646003e-06,
+      "clip_ratio/high_mean": 2.5604765028219845e-06,
+      "clip_ratio/low_mean": 2.64957521380893e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.905622847038103e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15907.0,
+      "completions/mean_length": 7383.2421875,
+      "completions/mean_terminated_length": 7240.37353515625,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 1.1512386053800583,
+      "epoch": 0.3109475620975161,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014476332580670714,
+      "learning_rate": 1e-05,
+      "loss": 0.0686,
+      "num_tokens": 277715450.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2477683424949646,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750256538391,
+      "sampling/importance_sampling_ratio/min": 4.5251621486386284e-05,
+      "sampling/sampling_logp_difference/max": 10.00327205657959,
+      "sampling/sampling_logp_difference/mean": 0.020672230049967766,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 3.7021679872850655e-06,
+      "clip_ratio/high_mean": 9.255419968212664e-07,
+      "clip_ratio/low_mean": 3.8645826748506806e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.957136880217149e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14693.0,
+      "completions/mean_length": 5876.421875,
+      "completions/mean_terminated_length": 5793.68505859375,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "entropy": 1.0786077454686165,
+      "epoch": 0.31186752529898804,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018895689863711596,
+      "learning_rate": 1e-05,
+      "loss": 0.0067,
+      "num_tokens": 278491688.0,
+      "reward": 0.3984375,
+      "reward_std": 0.21146979928016663,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998824596405029,
+      "sampling/importance_sampling_ratio/min": 0.0007111100130714476,
+      "sampling/sampling_logp_difference/max": 7.248683452606201,
+      "sampling/sampling_logp_difference/mean": 0.020282316952943802,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 1.8740533050731756e-05,
+      "clip_ratio/high_mean": 4.685133262682939e-06,
+      "clip_ratio/low_mean": 2.9699310402975243e-05,
+      "clip_ratio/low_min": 4.435140454006614e-06,
+      "clip_ratio/region_mean": 3.4384443438284507e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14627.0,
+      "completions/mean_length": 7162.625,
+      "completions/mean_terminated_length": 6709.1142578125,
+      "completions/min_length": 986.0,
+      "completions/min_terminated_length": 986.0,
+      "entropy": 0.898807168006897,
+      "epoch": 0.31278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002133915899321437,
+      "learning_rate": 1e-05,
+      "loss": 0.0222,
+      "num_tokens": 279427384.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32142335176467896,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 0.004845126066356897,
+      "sampling/sampling_logp_difference/max": 5.329782009124756,
+      "sampling/sampling_logp_difference/mean": 0.019643021747469902,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 1.472241683586617e-05,
+      "clip_ratio/high_mean": 5.561973125622899e-06,
+      "clip_ratio/low_mean": 6.452910844245707e-05,
+      "clip_ratio/low_min": 9.302988473791629e-06,
+      "clip_ratio/region_mean": 7.009108327338254e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15965.0,
+      "completions/mean_length": 7072.3828125,
+      "completions/mean_terminated_length": 6999.06298828125,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "entropy": 0.8942967653274536,
+      "epoch": 0.3137074517019319,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023624920286238194,
+      "learning_rate": 1e-05,
+      "loss": 0.0866,
+      "num_tokens": 280352177.0,
+      "reward": 0.375,
+      "reward_std": 0.36637401580810547,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999604225158691,
+      "sampling/importance_sampling_ratio/min": 0.0008250995306298137,
+      "sampling/sampling_logp_difference/max": 7.100006580352783,
+      "sampling/sampling_logp_difference/mean": 0.020037520676851273,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.717265596809739e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.717265596809739e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16300.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 6553.203125,
+      "completions/mean_terminated_length": 6553.203125,
+      "completions/min_length": 664.0,
+      "completions/min_terminated_length": 664.0,
+      "entropy": 0.8765531405806541,
+      "epoch": 0.31462741490340385,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0025228122249245644,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 281208411.0,
+      "reward": 0.40625,
+      "reward_std": 0.3390446603298187,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999656677246094,
+      "sampling/importance_sampling_ratio/min": 0.00030091358348727226,
+      "sampling/sampling_logp_difference/max": 8.108687400817871,
+      "sampling/sampling_logp_difference/mean": 0.018958289176225662,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 1.5562100998067763e-05,
+      "clip_ratio/high_mean": 3.890525249516941e-06,
+      "clip_ratio/low_mean": 6.593948137378902e-05,
+      "clip_ratio/low_min": 1.4238520634535234e-05,
+      "clip_ratio/region_mean": 6.983000685067964e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14916.0,
+      "completions/mean_length": 6489.40625,
+      "completions/mean_terminated_length": 6087.1865234375,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.8384068235754967,
+      "epoch": 0.3155473781048758,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003243578365072608,
+      "learning_rate": 1e-05,
+      "loss": 0.119,
+      "num_tokens": 282059863.0,
+      "reward": 0.515625,
+      "reward_std": 0.39689862728118896,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999314546585083,
+      "sampling/importance_sampling_ratio/min": 0.00026549631729722023,
+      "sampling/sampling_logp_difference/max": 8.233909606933594,
+      "sampling/sampling_logp_difference/mean": 0.01820875145494938,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 4.114007424504962e-06,
+      "clip_ratio/high_mean": 1.0285018561262405e-06,
+      "clip_ratio/low_mean": 3.0735714062757324e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.176421569150989e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15859.0,
+      "completions/max_terminated_length": 15859.0,
+      "completions/mean_length": 7148.7890625,
+      "completions/mean_terminated_length": 7148.7890625,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0214989855885506,
+      "epoch": 0.31646734130634774,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027867467142641544,
+      "learning_rate": 1e-05,
+      "loss": 0.0445,
+      "num_tokens": 282994036.0,
+      "reward": 0.4921875,
+      "reward_std": 0.28511500358581543,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999145269393921,
+      "sampling/importance_sampling_ratio/min": 0.027774186804890633,
+      "sampling/sampling_logp_difference/max": 3.583648204803467,
+      "sampling/sampling_logp_difference/mean": 0.0217401385307312,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 1.6063933799159713e-05,
+      "clip_ratio/high_mean": 5.513276278179546e-06,
+      "clip_ratio/low_mean": 4.230772367463942e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.782100086231367e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16091.0,
+      "completions/max_terminated_length": 16091.0,
+      "completions/mean_length": 5532.1328125,
+      "completions/mean_terminated_length": 5532.1328125,
+      "completions/min_length": 467.0,
+      "completions/min_terminated_length": 467.0,
+      "entropy": 0.9303388148546219,
+      "epoch": 0.3173873045078197,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0024432060308754444,
+      "learning_rate": 1e-05,
+      "loss": 0.0251,
+      "num_tokens": 283723605.0,
+      "reward": 0.421875,
+      "reward_std": 0.38717782497406006,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.011936242692172527,
+      "sampling/sampling_logp_difference/max": 4.428175926208496,
+      "sampling/sampling_logp_difference/mean": 0.019281461834907532,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 6.218693215487292e-06,
+      "clip_ratio/high_mean": 1.554673303871823e-06,
+      "clip_ratio/low_mean": 1.5384349637770356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6939022600581666e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6830.09375,
+      "completions/mean_terminated_length": 6441.72314453125,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "entropy": 0.9551377296447754,
+      "epoch": 0.31830726770929163,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031446516513824463,
+      "learning_rate": 1e-05,
+      "loss": -0.0037,
+      "num_tokens": 284617089.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20911568403244019,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999873042106628,
+      "sampling/importance_sampling_ratio/min": 0.0007485119276680052,
+      "sampling/sampling_logp_difference/max": 7.197423458099365,
+      "sampling/sampling_logp_difference/mean": 0.01985902711749077,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 7.772906428726856e-06,
+      "clip_ratio/high_mean": 2.8712697712762747e-06,
+      "clip_ratio/low_mean": 3.287052913947264e-05,
+      "clip_ratio/low_min": 2.789369091260596e-06,
+      "clip_ratio/region_mean": 3.574179936549626e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15613.0,
+      "completions/mean_length": 6557.3515625,
+      "completions/mean_terminated_length": 6401.37353515625,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "entropy": 1.0254710763692856,
+      "epoch": 0.31922723091076355,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0024617225863039494,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 285475910.0,
+      "reward": 0.390625,
+      "reward_std": 0.2761683464050293,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999537467956543,
+      "sampling/importance_sampling_ratio/min": 0.006225659977644682,
+      "sampling/sampling_logp_difference/max": 5.079075813293457,
+      "sampling/sampling_logp_difference/mean": 0.021138068288564682,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 1.0258745533064939e-05,
+      "clip_ratio/high_mean": 3.588538106669148e-06,
+      "clip_ratio/low_mean": 6.333507008093875e-05,
+      "clip_ratio/low_min": 4.415712737682043e-06,
+      "clip_ratio/region_mean": 6.692360875604209e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15851.0,
+      "completions/mean_length": 7379.140625,
+      "completions/mean_terminated_length": 7088.6611328125,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9518962875008583,
+      "epoch": 0.3201471941122355,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017496495274826884,
+      "learning_rate": 1e-05,
+      "loss": 0.0734,
+      "num_tokens": 286439696.0,
+      "reward": 0.390625,
+      "reward_std": 0.26538965106010437,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999358654022217,
+      "sampling/importance_sampling_ratio/min": 0.006735759321600199,
+      "sampling/sampling_logp_difference/max": 5.000324726104736,
+      "sampling/sampling_logp_difference/mean": 0.021384600549936295,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.854056094747648e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.854056094747648e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16123.0,
+      "completions/mean_length": 5960.140625,
+      "completions/mean_terminated_length": 5878.06298828125,
+      "completions/min_length": 833.0,
+      "completions/min_terminated_length": 833.0,
+      "entropy": 0.9556702002882957,
+      "epoch": 0.32106715731370744,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0013999518705531955,
+      "learning_rate": 1e-05,
+      "loss": 0.0484,
+      "num_tokens": 287226394.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549984931946,
+      "sampling/importance_sampling_ratio/min": 8.140038517012727e-06,
+      "sampling/sampling_logp_difference/max": 11.71871566772461,
+      "sampling/sampling_logp_difference/mean": 0.01937047764658928,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 8.395007171202451e-06,
+      "clip_ratio/high_mean": 2.0987517928006127e-06,
+      "clip_ratio/low_mean": 3.610323426528339e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.820198628545768e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12561.0,
+      "completions/mean_length": 5387.546875,
+      "completions/mean_terminated_length": 5300.96044921875,
+      "completions/min_length": 464.0,
+      "completions/min_terminated_length": 464.0,
+      "entropy": 0.95712860673666,
+      "epoch": 0.3219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004228786565363407,
+      "learning_rate": 1e-05,
+      "loss": 0.0692,
+      "num_tokens": 287935952.0,
+      "reward": 0.5234375,
+      "reward_std": 0.29378965497016907,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000360012054443,
+      "sampling/importance_sampling_ratio/min": 0.005966294556856155,
+      "sampling/sampling_logp_difference/max": 5.121629238128662,
+      "sampling/sampling_logp_difference/mean": 0.020441649481654167,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 1.2559637070808094e-05,
+      "clip_ratio/high_mean": 3.1399092677020235e-06,
+      "clip_ratio/low_mean": 2.673440690159623e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9874316624045605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15957.0,
+      "completions/mean_length": 5799.625,
+      "completions/mean_terminated_length": 5716.283203125,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.9457403644919395,
+      "epoch": 0.32290708371665133,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0029834613669663668,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 288696000.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3884710967540741,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999950528144836,
+      "sampling/importance_sampling_ratio/min": 0.0011352180736139417,
+      "sampling/sampling_logp_difference/max": 6.780930519104004,
+      "sampling/sampling_logp_difference/mean": 0.021189026534557343,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 6.2518756749341264e-06,
+      "clip_ratio/high_mean": 1.5629689187335316e-06,
+      "clip_ratio/low_mean": 3.849920358334202e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0062172047328204e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16148.0,
+      "completions/mean_length": 7058.6875,
+      "completions/mean_terminated_length": 6757.87060546875,
+      "completions/min_length": 799.0,
+      "completions/min_terminated_length": 799.0,
+      "entropy": 0.8782663866877556,
+      "epoch": 0.32382704691812325,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002617151942104101,
+      "learning_rate": 1e-05,
+      "loss": 0.0874,
+      "num_tokens": 289618904.0,
+      "reward": 0.3515625,
+      "reward_std": 0.28353992104530334,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999942779541016,
+      "sampling/importance_sampling_ratio/min": 0.001438659499399364,
+      "sampling/sampling_logp_difference/max": 6.54404354095459,
+      "sampling/sampling_logp_difference/mean": 0.019699860364198685,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 1.8079134861181956e-05,
+      "clip_ratio/high_mean": 4.519783715295489e-06,
+      "clip_ratio/low_mean": 6.639697721766424e-05,
+      "clip_ratio/low_min": 1.0295151696482208e-05,
+      "clip_ratio/region_mean": 7.091676206982811e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15519.0,
+      "completions/mean_length": 6609.953125,
+      "completions/mean_terminated_length": 6454.81005859375,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "entropy": 0.8895087689161301,
+      "epoch": 0.3247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0021503251045942307,
+      "learning_rate": 1e-05,
+      "loss": 0.044,
+      "num_tokens": 290484378.0,
+      "reward": 0.3671875,
+      "reward_std": 0.35324612259864807,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 5.448641240946017e-05,
+      "sampling/sampling_logp_difference/max": 9.817559242248535,
+      "sampling/sampling_logp_difference/mean": 0.0200796015560627,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 6.141278026916552e-05,
+      "clip_ratio/low_min": 1.333249815616e-05,
+      "clip_ratio/region_mean": 6.141278026916552e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 7872.4921875,
+      "completions/mean_terminated_length": 7453.89306640625,
+      "completions/min_length": 328.0,
+      "completions/min_terminated_length": 328.0,
+      "entropy": 0.9183534607291222,
+      "epoch": 0.32566697332106714,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0023925534915179014,
+      "learning_rate": 1e-05,
+      "loss": 0.0895,
+      "num_tokens": 291512393.0,
+      "reward": 0.34375,
+      "reward_std": 0.3763991594314575,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0004287353658583015,
+      "sampling/sampling_logp_difference/max": 7.7546706199646,
+      "sampling/sampling_logp_difference/mean": 0.020358648151159286,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 1.0912609013757901e-05,
+      "clip_ratio/high_mean": 3.7178592720010784e-06,
+      "clip_ratio/low_mean": 1.995230707052542e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.367016588777915e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15485.0,
+      "completions/mean_length": 6605.6640625,
+      "completions/mean_terminated_length": 6290.23388671875,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "entropy": 0.9602678120136261,
+      "epoch": 0.3265869365225391,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018709113355726004,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 292380390.0,
+      "reward": 0.515625,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999732375144958,
+      "sampling/importance_sampling_ratio/min": 6.221406168016586e-10,
+      "sampling/sampling_logp_difference/max": 21.19785499572754,
+      "sampling/sampling_logp_difference/mean": 0.02150166593492031,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 2.202200403189636e-05,
+      "clip_ratio/high_mean": 6.279054105107207e-06,
+      "clip_ratio/low_mean": 5.168271604816255e-05,
+      "clip_ratio/low_min": 7.731559890089557e-06,
+      "clip_ratio/region_mean": 5.796177038064343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13477.0,
+      "completions/max_terminated_length": 13477.0,
+      "completions/mean_length": 6677.8828125,
+      "completions/mean_terminated_length": 6677.8828125,
+      "completions/min_length": 754.0,
+      "completions/min_terminated_length": 754.0,
+      "entropy": 1.001693107187748,
+      "epoch": 0.32750689972401104,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017649955116212368,
+      "learning_rate": 1e-05,
+      "loss": 0.0502,
+      "num_tokens": 293255287.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998878240585327,
+      "sampling/importance_sampling_ratio/min": 0.0027159738820046186,
+      "sampling/sampling_logp_difference/max": 5.908604621887207,
+      "sampling/sampling_logp_difference/mean": 0.020375655964016914,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 5.7686097534315195e-06,
+      "clip_ratio/high_mean": 2.223324372607749e-06,
+      "clip_ratio/low_mean": 2.7612236522145395e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9835560894753144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15925.0,
+      "completions/mean_length": 6210.6953125,
+      "completions/mean_terminated_length": 6049.21484375,
+      "completions/min_length": 870.0,
+      "completions/min_terminated_length": 870.0,
+      "entropy": 0.9842480793595314,
+      "epoch": 0.32842686292548295,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024816791992634535,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 294069184.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000953674316406,
+      "sampling/importance_sampling_ratio/min": 0.0047831060364842415,
+      "sampling/sampling_logp_difference/max": 5.342665195465088,
+      "sampling/sampling_logp_difference/mean": 0.021009165793657303,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 5.0844009820139036e-06,
+      "clip_ratio/high_mean": 1.2711002455034759e-06,
+      "clip_ratio/low_mean": 4.299241186345171e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.426351074471313e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16085.0,
+      "completions/mean_length": 6876.0546875,
+      "completions/mean_terminated_length": 6725.13525390625,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "entropy": 0.8680268228054047,
+      "epoch": 0.32934682612695493,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0030787813011556864,
+      "learning_rate": 1e-05,
+      "loss": 0.1096,
+      "num_tokens": 294969111.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3514111638069153,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999387264251709,
+      "sampling/importance_sampling_ratio/min": 0.0036591701209545135,
+      "sampling/sampling_logp_difference/max": 5.610518932342529,
+      "sampling/sampling_logp_difference/mean": 0.019419874995946884,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 5.279830929794116e-06,
+      "clip_ratio/high_mean": 1.319957732448529e-06,
+      "clip_ratio/low_mean": 3.3445195754211454e-05,
+      "clip_ratio/low_min": 3.1955414669937454e-06,
+      "clip_ratio/region_mean": 3.476515314559947e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16154.0,
+      "completions/mean_length": 7079.7734375,
+      "completions/mean_terminated_length": 6932.087890625,
+      "completions/min_length": 973.0,
+      "completions/min_terminated_length": 973.0,
+      "entropy": 1.0033101588487625,
+      "epoch": 0.33026678932842685,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027940638829022646,
+      "learning_rate": 1e-05,
+      "loss": 0.1352,
+      "num_tokens": 295894682.0,
+      "reward": 0.4140625,
+      "reward_std": 0.40319663286209106,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999890923500061,
+      "sampling/importance_sampling_ratio/min": 0.00033553718822076917,
+      "sampling/sampling_logp_difference/max": 7.999777793884277,
+      "sampling/sampling_logp_difference/mean": 0.021608728915452957,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 4.0542295209888835e-06,
+      "clip_ratio/high_mean": 1.0135573802472209e-06,
+      "clip_ratio/low_mean": 3.935158406420669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0365141785514425e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14374.0,
+      "completions/mean_length": 6487.421875,
+      "completions/mean_terminated_length": 6249.904296875,
+      "completions/min_length": 637.0,
+      "completions/min_terminated_length": 637.0,
+      "entropy": 0.9404204189777374,
+      "epoch": 0.3311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021709369029849768,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 296744216.0,
+      "reward": 0.4296875,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000174045562744,
+      "sampling/importance_sampling_ratio/min": 0.00012341697583906353,
+      "sampling/sampling_logp_difference/max": 8.9999418258667,
+      "sampling/sampling_logp_difference/mean": 0.02024281956255436,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 2.4414162908215076e-05,
+      "clip_ratio/high_mean": 6.103540727053769e-06,
+      "clip_ratio/low_mean": 2.0490186102506414e-05,
+      "clip_ratio/low_min": 2.8498473056970397e-06,
+      "clip_ratio/region_mean": 2.6593726602186507e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14584.0,
+      "completions/mean_length": 6668.1953125,
+      "completions/mean_terminated_length": 6273.24365234375,
+      "completions/min_length": 567.0,
+      "completions/min_terminated_length": 567.0,
+      "entropy": 0.8671490699052811,
+      "epoch": 0.33210671573137074,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018110686214640737,
+      "learning_rate": 1e-05,
+      "loss": -0.0018,
+      "num_tokens": 297617937.0,
+      "reward": 0.4765625,
+      "reward_std": 0.22673210501670837,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999983549118042,
+      "sampling/importance_sampling_ratio/min": 0.0003801324055530131,
+      "sampling/sampling_logp_difference/max": 7.874990940093994,
+      "sampling/sampling_logp_difference/mean": 0.01934785582125187,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 8.66071218297293e-06,
+      "clip_ratio/high_mean": 2.1651780457432324e-06,
+      "clip_ratio/low_mean": 2.4539695857583865e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6704873903327098e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15979.0,
+      "completions/mean_length": 8579.9921875,
+      "completions/mean_terminated_length": 7989.7734375,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 1.0337364450097084,
+      "epoch": 0.3330266789328427,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014365602983161807,
+      "learning_rate": 1e-05,
+      "loss": 0.045,
+      "num_tokens": 298736304.0,
+      "reward": 0.1953125,
+      "reward_std": 0.1999218761920929,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999146461486816,
+      "sampling/importance_sampling_ratio/min": 0.0014037116197869182,
+      "sampling/sampling_logp_difference/max": 6.5686354637146,
+      "sampling/sampling_logp_difference/mean": 0.021067796275019646,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 7.748803682261496e-06,
+      "clip_ratio/high_mean": 1.937200920565374e-06,
+      "clip_ratio/low_mean": 5.063434127805522e-05,
+      "clip_ratio/low_min": 9.66116931522265e-06,
+      "clip_ratio/region_mean": 5.257154271021136e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16368.0,
+      "completions/mean_length": 7000.8203125,
+      "completions/mean_terminated_length": 6926.93701171875,
+      "completions/min_length": 456.0,
+      "completions/min_terminated_length": 456.0,
+      "entropy": 0.8918163478374481,
+      "epoch": 0.33394664213431463,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003008107887580991,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "num_tokens": 299653249.0,
+      "reward": 0.453125,
+      "reward_std": 0.3322049677371979,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999739527702332,
+      "sampling/importance_sampling_ratio/min": 0.002478980226442218,
+      "sampling/sampling_logp_difference/max": 5.999907970428467,
+      "sampling/sampling_logp_difference/mean": 0.020022090524435043,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 1.5043352505017538e-05,
+      "clip_ratio/high_mean": 3.7608381262543844e-06,
+      "clip_ratio/low_mean": 8.800596447144926e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.2561434687086148e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16375.0,
+      "completions/max_terminated_length": 16375.0,
+      "completions/mean_length": 7319.578125,
+      "completions/mean_terminated_length": 7319.578125,
+      "completions/min_length": 1974.0,
+      "completions/min_terminated_length": 1974.0,
+      "entropy": 0.9145128801465034,
+      "epoch": 0.33486660533578655,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0010370119707658887,
+      "learning_rate": 1e-05,
+      "loss": 0.0138,
+      "num_tokens": 300608099.0,
+      "reward": 0.4609375,
+      "reward_std": 0.1412346363067627,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999158382415771,
+      "sampling/importance_sampling_ratio/min": 0.00012156071898061782,
+      "sampling/sampling_logp_difference/max": 9.015096664428711,
+      "sampling/sampling_logp_difference/mean": 0.019386455416679382,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 9.589830597178661e-06,
+      "clip_ratio/high_mean": 2.3974576492946653e-06,
+      "clip_ratio/low_mean": 2.2494899667435675e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4892357714634272e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 6956.90625,
+      "completions/mean_terminated_length": 6882.67724609375,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.9679212644696236,
+      "epoch": 0.3357865685372585,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021569218952208757,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 301516535.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23462772369384766,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 0.01621459797024727,
+      "sampling/sampling_logp_difference/max": 4.121843338012695,
+      "sampling/sampling_logp_difference/mean": 0.020638462156057358,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 1.1957331025769236e-05,
+      "clip_ratio/high_mean": 2.989332756442309e-06,
+      "clip_ratio/low_mean": 2.334770033485256e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6337033204981708e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16343.0,
+      "completions/mean_length": 6933.1953125,
+      "completions/mean_terminated_length": 6706.37646484375,
+      "completions/min_length": 979.0,
+      "completions/min_terminated_length": 979.0,
+      "entropy": 0.9610472694039345,
+      "epoch": 0.33670653173873044,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0019900640472769737,
+      "learning_rate": 1e-05,
+      "loss": 0.0329,
+      "num_tokens": 302422120.0,
+      "reward": 0.4921875,
+      "reward_std": 0.22908620536327362,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999517202377319,
+      "sampling/importance_sampling_ratio/min": 7.346414143682978e-09,
+      "sampling/sampling_logp_difference/max": 18.729053497314453,
+      "sampling/sampling_logp_difference/mean": 0.020782412961125374,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 1.6365190958822495e-05,
+      "clip_ratio/high_mean": 4.091297739705624e-06,
+      "clip_ratio/low_mean": 2.5385876426753384e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9477173825398495e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15799.0,
+      "completions/max_terminated_length": 15799.0,
+      "completions/mean_length": 6711.640625,
+      "completions/mean_terminated_length": 6711.640625,
+      "completions/min_length": 814.0,
+      "completions/min_terminated_length": 814.0,
+      "entropy": 0.8035724982619286,
+      "epoch": 0.3376264949402024,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001954294042661786,
+      "learning_rate": 1e-05,
+      "loss": 0.0264,
+      "num_tokens": 303299402.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2856517732143402,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000114440917969,
+      "sampling/importance_sampling_ratio/min": 0.002623806707561016,
+      "sampling/sampling_logp_difference/max": 5.943129062652588,
+      "sampling/sampling_logp_difference/mean": 0.018188728019595146,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 8.633360948806512e-06,
+      "clip_ratio/high_mean": 2.158340237201628e-06,
+      "clip_ratio/low_mean": 3.7187305906627444e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9345645916455396e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15980.0,
+      "completions/mean_length": 6977.890625,
+      "completions/mean_terminated_length": 6674.4677734375,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.9545647650957108,
+      "epoch": 0.33854645814167433,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022571857552975416,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 304210412.0,
+      "reward": 0.4375,
+      "reward_std": 0.19568344950675964,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999645948410034,
+      "sampling/importance_sampling_ratio/min": 5.501153282239102e-06,
+      "sampling/sampling_logp_difference/max": 12.110552787780762,
+      "sampling/sampling_logp_difference/mean": 0.021196123212575912,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 1.2197504474897869e-05,
+      "clip_ratio/high_mean": 3.0493761187244672e-06,
+      "clip_ratio/low_mean": 2.7975384682576987e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1024760801301454e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16087.0,
+      "completions/mean_length": 5952.8359375,
+      "completions/mean_terminated_length": 5349.3798828125,
+      "completions/min_length": 651.0,
+      "completions/min_terminated_length": 651.0,
+      "entropy": 0.846152663230896,
+      "epoch": 0.33946642134314625,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003541936632245779,
+      "learning_rate": 1e-05,
+      "loss": 0.0897,
+      "num_tokens": 304989015.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998842477798462,
+      "sampling/importance_sampling_ratio/min": 0.0019083521328866482,
+      "sampling/sampling_logp_difference/max": 6.261515140533447,
+      "sampling/sampling_logp_difference/mean": 0.018978029489517212,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 1.1725882586688385e-05,
+      "clip_ratio/high_mean": 2.9314706466720963e-06,
+      "clip_ratio/low_mean": 6.290217379500973e-05,
+      "clip_ratio/low_min": 1.226112590302364e-05,
+      "clip_ratio/region_mean": 6.583364438483841e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16098.0,
+      "completions/mean_length": 7976.9296875,
+      "completions/mean_terminated_length": 7635.1787109375,
+      "completions/min_length": 514.0,
+      "completions/min_terminated_length": 514.0,
+      "entropy": 0.9827005565166473,
+      "epoch": 0.3403863845446182,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023713603150099516,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 306032054.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2527809143066406,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000019073486328,
+      "sampling/importance_sampling_ratio/min": 3.2563195873080986e-07,
+      "sampling/sampling_logp_difference/max": 14.937498092651367,
+      "sampling/sampling_logp_difference/mean": 0.0217706598341465,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 2.3902987095425487e-05,
+      "clip_ratio/high_mean": 7.721868257704045e-06,
+      "clip_ratio/low_mean": 4.01184702241153e-05,
+      "clip_ratio/low_min": 1.341508686891757e-05,
+      "clip_ratio/region_mean": 4.784033922078379e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16369.0,
+      "completions/mean_length": 7117.8828125,
+      "completions/mean_terminated_length": 6895.49609375,
+      "completions/min_length": 1314.0,
+      "completions/min_terminated_length": 1314.0,
+      "entropy": 0.8897347301244736,
+      "epoch": 0.34130634774609014,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023132229689508677,
+      "learning_rate": 1e-05,
+      "loss": 0.162,
+      "num_tokens": 306960599.0,
+      "reward": 0.515625,
+      "reward_std": 0.34822866320610046,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999181032180786,
+      "sampling/importance_sampling_ratio/min": 0.0007341355667449534,
+      "sampling/sampling_logp_difference/max": 7.2168169021606445,
+      "sampling/sampling_logp_difference/mean": 0.018669119104743004,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 4.371240720502101e-06,
+      "clip_ratio/high_mean": 1.0928101801255252e-06,
+      "clip_ratio/low_mean": 4.9660218792269006e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.075302897239453e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15995.0,
+      "completions/mean_length": 6971.0390625,
+      "completions/mean_terminated_length": 6745.12841796875,
+      "completions/min_length": 871.0,
+      "completions/min_terminated_length": 871.0,
+      "entropy": 1.0919678956270218,
+      "epoch": 0.3422263109475621,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0030236958991736174,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 307873100.0,
+      "reward": 0.3359375,
+      "reward_std": 0.34245961904525757,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000091791152954,
+      "sampling/importance_sampling_ratio/min": 0.01082979142665863,
+      "sampling/sampling_logp_difference/max": 4.525454521179199,
+      "sampling/sampling_logp_difference/mean": 0.022024717181921005,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 4.341634394222638e-06,
+      "clip_ratio/high_mean": 1.0854085985556594e-06,
+      "clip_ratio/low_mean": 3.061858558339736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.170399429563986e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14442.0,
+      "completions/mean_length": 7120.0,
+      "completions/mean_terminated_length": 6897.66455078125,
+      "completions/min_length": 1685.0,
+      "completions/min_terminated_length": 1685.0,
+      "entropy": 1.0812252908945084,
+      "epoch": 0.34314627414903404,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0018919071881100535,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 308804876.0,
+      "reward": 0.28125,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999696612358093,
+      "sampling/importance_sampling_ratio/min": 0.0011743507348001003,
+      "sampling/sampling_logp_difference/max": 6.747039794921875,
+      "sampling/sampling_logp_difference/mean": 0.022177904844284058,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 4.6198765630833805e-06,
+      "clip_ratio/high_mean": 1.1549691407708451e-06,
+      "clip_ratio/low_mean": 1.3996559573570266e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.5151528714341111e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15814.0,
+      "completions/mean_length": 7344.5546875,
+      "completions/mean_terminated_length": 6977.09716796875,
+      "completions/min_length": 127.0,
+      "completions/min_terminated_length": 127.0,
+      "entropy": 0.9340410158038139,
+      "epoch": 0.34406623735050595,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001848200336098671,
+      "learning_rate": 1e-05,
+      "loss": 0.0195,
+      "num_tokens": 309762603.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2188364714384079,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999948143959045,
+      "sampling/importance_sampling_ratio/min": 0.0002964614541269839,
+      "sampling/sampling_logp_difference/max": 8.1235933303833,
+      "sampling/sampling_logp_difference/mean": 0.02034556306898594,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 1.3913735983805964e-05,
+      "clip_ratio/high_mean": 3.478433995951491e-06,
+      "clip_ratio/low_mean": 2.4544106395296694e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8022539936500834e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15244.0,
+      "completions/max_terminated_length": 15244.0,
+      "completions/mean_length": 6615.6484375,
+      "completions/mean_terminated_length": 6615.6484375,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "entropy": 0.971637412905693,
+      "epoch": 0.34498620055197793,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0012123315827921033,
+      "learning_rate": 1e-05,
+      "loss": 0.0581,
+      "num_tokens": 310628230.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999868869781494,
+      "sampling/importance_sampling_ratio/min": 2.587145718280226e-05,
+      "sampling/sampling_logp_difference/max": 10.562370300292969,
+      "sampling/sampling_logp_difference/mean": 0.020877305418252945,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 6.119951194705209e-06,
+      "clip_ratio/high_mean": 1.5299877986763022e-06,
+      "clip_ratio/low_mean": 4.789722436271404e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.942721272982453e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16028.0,
+      "completions/mean_length": 6333.84375,
+      "completions/mean_terminated_length": 6009.64501953125,
+      "completions/min_length": 564.0,
+      "completions/min_terminated_length": 564.0,
+      "entropy": 0.9569023698568344,
+      "epoch": 0.34590616375344985,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002646032487973571,
+      "learning_rate": 1e-05,
+      "loss": 0.086,
+      "num_tokens": 311457466.0,
+      "reward": 0.4453125,
+      "reward_std": 0.34928950667381287,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000247955322266,
+      "sampling/importance_sampling_ratio/min": 0.022760435938835144,
+      "sampling/sampling_logp_difference/max": 3.782731533050537,
+      "sampling/sampling_logp_difference/mean": 0.020464638248085976,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 1.8126566374121467e-05,
+      "clip_ratio/high_mean": 4.531641593530367e-06,
+      "clip_ratio/low_mean": 4.1024483266483e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5556124632639694e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15719.0,
+      "completions/mean_length": 6657.8515625,
+      "completions/mean_terminated_length": 6503.46875,
+      "completions/min_length": 594.0,
+      "completions/min_terminated_length": 594.0,
+      "entropy": 1.029910758137703,
+      "epoch": 0.3468261269549218,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021437006071209908,
+      "learning_rate": 1e-05,
+      "loss": -0.0212,
+      "num_tokens": 312330879.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000024437904358,
+      "sampling/importance_sampling_ratio/min": 0.020200612023472786,
+      "sampling/sampling_logp_difference/max": 3.9020423889160156,
+      "sampling/sampling_logp_difference/mean": 0.021411258727312088,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7961265118392475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7961265118392475e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16311.0,
+      "completions/mean_length": 7657.8359375,
+      "completions/mean_terminated_length": 7376.3466796875,
+      "completions/min_length": 741.0,
+      "completions/min_terminated_length": 741.0,
+      "entropy": 0.9699486121535301,
+      "epoch": 0.34774609015639374,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018965511117130518,
+      "learning_rate": 1e-05,
+      "loss": 0.066,
+      "num_tokens": 313331898.0,
+      "reward": 0.3515625,
+      "reward_std": 0.18884865939617157,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000010371208191,
+      "sampling/importance_sampling_ratio/min": 7.867415661166888e-06,
+      "sampling/sampling_logp_difference/max": 11.75278091430664,
+      "sampling/sampling_logp_difference/mean": 0.021029409021139145,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 7.721664815107943e-06,
+      "clip_ratio/high_mean": 2.7168170504410227e-06,
+      "clip_ratio/low_mean": 4.313065619498957e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.584747375702136e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14619.0,
+      "completions/mean_length": 7085.3671875,
+      "completions/mean_terminated_length": 6937.77001953125,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "entropy": 1.0943557620048523,
+      "epoch": 0.3486660533578657,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016498853219673038,
+      "learning_rate": 1e-05,
+      "loss": 0.0346,
+      "num_tokens": 314258601.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24329257011413574,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000105857849121,
+      "sampling/importance_sampling_ratio/min": 0.03447282314300537,
+      "sampling/sampling_logp_difference/max": 3.367583990097046,
+      "sampling/sampling_logp_difference/mean": 0.021414825692772865,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 7.953489330247976e-06,
+      "clip_ratio/high_mean": 1.988372332561994e-06,
+      "clip_ratio/low_mean": 3.479703536868328e-05,
+      "clip_ratio/low_min": 2.6767741019284585e-06,
+      "clip_ratio/region_mean": 3.6785407701245276e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15631.0,
+      "completions/mean_length": 7614.1171875,
+      "completions/mean_terminated_length": 7182.81103515625,
+      "completions/min_length": 511.0,
+      "completions/min_terminated_length": 511.0,
+      "entropy": 0.9673903658986092,
+      "epoch": 0.34958601655933763,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001364902127534151,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 315256840.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3503454327583313,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 6.874255632283166e-05,
+      "sampling/sampling_logp_difference/max": 9.585142135620117,
+      "sampling/sampling_logp_difference/mean": 0.02000460773706436,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 6.980824764468707e-06,
+      "clip_ratio/high_mean": 1.7452061911171768e-06,
+      "clip_ratio/low_mean": 4.410173994529032e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5846945681660145e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15637.0,
+      "completions/mean_length": 7766.9375,
+      "completions/mean_terminated_length": 7630.1591796875,
+      "completions/min_length": 57.0,
+      "completions/min_terminated_length": 57.0,
+      "entropy": 1.0277370810508728,
+      "epoch": 0.35050597976080955,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002171436557546258,
+      "learning_rate": 1e-05,
+      "loss": 0.0705,
+      "num_tokens": 316268976.0,
+      "reward": 0.34375,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999735951423645,
+      "sampling/importance_sampling_ratio/min": 7.485197420464829e-05,
+      "sampling/sampling_logp_difference/max": 9.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.021251089870929718,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 9.843256520980503e-06,
+      "clip_ratio/high_mean": 3.5061395919910865e-06,
+      "clip_ratio/low_mean": 3.973216325903195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.323830307839671e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15752.0,
+      "completions/mean_length": 7774.2265625,
+      "completions/mean_terminated_length": 7567.59228515625,
+      "completions/min_length": 595.0,
+      "completions/min_terminated_length": 595.0,
+      "entropy": 1.0064171329140663,
+      "epoch": 0.3514259429622815,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0013348929351195693,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 317285677.0,
+      "reward": 0.28125,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999904632568359,
+      "sampling/importance_sampling_ratio/min": 1.7632934259381727e-06,
+      "sampling/sampling_logp_difference/max": 13.248327255249023,
+      "sampling/sampling_logp_difference/mean": 0.022232960909605026,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.2021426648043416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2021426648043416e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16208.0,
+      "completions/mean_length": 6547.1796875,
+      "completions/mean_terminated_length": 6469.724609375,
+      "completions/min_length": 894.0,
+      "completions/min_terminated_length": 894.0,
+      "entropy": 0.9192209765315056,
+      "epoch": 0.35234590616375344,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002925361506640911,
+      "learning_rate": 1e-05,
+      "loss": 0.0809,
+      "num_tokens": 318148276.0,
+      "reward": 0.515625,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999691843986511,
+      "sampling/importance_sampling_ratio/min": 7.411971182591515e-06,
+      "sampling/sampling_logp_difference/max": 11.812414169311523,
+      "sampling/sampling_logp_difference/mean": 0.020470617339015007,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 1.543848429719219e-05,
+      "clip_ratio/high_mean": 3.8596210742980475e-06,
+      "clip_ratio/low_mean": 2.0332364726982632e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4191985573907004e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6904.40625,
+      "completions/mean_terminated_length": 6101.05078125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.9611739367246628,
+      "epoch": 0.3532658693652254,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002288331277668476,
+      "learning_rate": 1e-05,
+      "loss": 0.0388,
+      "num_tokens": 319052224.0,
+      "reward": 0.390625,
+      "reward_std": 0.23645779490470886,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999583959579468,
+      "sampling/importance_sampling_ratio/min": 1.0906596799031831e-05,
+      "sampling/sampling_logp_difference/max": 11.426142692565918,
+      "sampling/sampling_logp_difference/mean": 0.02049478143453598,
+      "step": 384
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 319052224,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-384/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-384/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:  # metadata that parse_model_states extracts from one model-states file
+    buffers: dict  # buffer name -> tensor, already cast to fp32 by parse_model_states
+    param_shapes: dict  # NOTE(review): parse_model_states iterates this as a sequence of dicts - confirm type
+    shared_params: list  # [key, value] pairs taken from the checkpoint's "shared_params" mapping
+    ds_version: int  # NOTE(review): read with .get(..., None), so it may be None - confirm the real type
+    frozen_param_shapes: dict  # None when the checkpoint has no frozen-param-shapes entry
+    frozen_param_fragments: dict  # None when the checkpoint has no frozen-param-fragments entry
+
+
+debug = 0  # when truthy, parse_model_states prints the buffer names and frozen param shapes it finds
+
+# used as map_location for torch.load, so checkpoint tensors are loaded onto the CPU
+device = torch.device('cpu')
+
+
+def atoi(text):  # digit-only strings become ints; any other string is returned unchanged
+    return text if not text.isdigit() else int(text)
+
+
+def natural_keys(text):
+    '''
+    Sort key for human ("natural") ordering; use as alist.sort(key=natural_keys).
+    Splits text on runs of digits and passes every piece through atoi.
+    See http://nedbatchelder.com/blog/200712/human_sorting.html (Toothy's comment).
+    '''
+    return list(map(atoi, re.split(r'(\d+)', text)))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    # Return the path of the single model-states file for `zero_stage` (<= 2, or 3) in `checkpoint_dir`.
+    # Raises FileNotFoundError if the directory or file is missing, ValueError for any other stage.
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:  # previously fell through with `file` unbound and surfaced as UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # Collect the files in checkpoint_dir that match glob_pattern, ordered by natural_keys.
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    ckpt_files = sorted(matches, key=natural_keys)
+    if not ckpt_files:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):  # all "*_optim_states.pt" files in checkpoint_dir, in natural sort order
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):  # all "*_model_states.pt" files in checkpoint_dir, in natural sort order
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    """Load each model-states file and collect the metadata kept in a ``zero_model_state``.
+
+    Args:
+        files: paths of the model-states checkpoint files to read.
+
+    Returns:
+        A list with one ``zero_model_state`` per input file, in the order given.
+
+    Raises:
+        ValueError: if a file has no ``BUFFER_NAMES`` entry, i.e. it is not a model state checkpoint.
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE: weights_only=False unpickles arbitrary objects - only load checkpoints you trust.
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional: .get() yields None when the checkpoint has no such entry
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+
+        # shared params are stored as a mapping; keep them as [key, value] pairs
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        zero_model_states.append(
+            zero_model_state(buffers=buffers,
+                             param_shapes=param_shapes,
+                             shared_params=shared_params,
+                             ds_version=state_dict.get(DS_VERSION, None),
+                             frozen_param_shapes=frozen_param_shapes,
+                             frozen_param_fragments=state_dict.get(FROZEN_PARAM_FRAGMENTS, None)))
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict`` (in place).
+
+    Args:
+        - ``state_dict``: mapping that receives one entry per parameter name
+        - ``world_size``: number of ranks; also determines the alignment used by the sanity check
+        - ``fp32_flat_groups``: per-rank sequences of flat fp32 tensors, indexed ``[rank][param_group]``
+        - ``zero_model_states``: parsed model states; only rank 0's ``param_shapes`` is read
+
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed element count of a param group differs
+          from the number of elements available in it
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # 1. for every param group, concatenate the partitions of all ranks into one flat vector
+    # 2. walk the group's shapes in order and slice each parameter out of that vector
+    # 3. verify that (up to alignment padding) the whole vector was consumed
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # gather group i from every rank, in rank order
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    # total over all groups; only reported in debug mode, the per-group value is recomputed below
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset is the running position inside this group's merged vector
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shape may expose numel() or be a plain sequence of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # narrow()/view() give a reshaped window onto the merged vector for this parameter
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. Pass ``None`` to write a single file without sharding
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: if ``safetensors`` (for ``safe_serialization``) or ``huggingface_hub``
+          (for ``max_shard_size``) is requested but not installed
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # (lazy mode: the values are converted to torch tensors shard by shard further below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        # turn e.g. "pytorch_model.bin" into "pytorch_model{suffix}.bin"
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty tensors of the same
+        # shape and dtype instead of the real weights
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: build a minimal stand-in with the two attributes used below
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # convert only the entries that belong to this shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/README.md b/dapo_lora_plus_20251202_001141/checkpoint-448/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-448/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-448/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-448/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-448/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-448/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/latest b/dapo_lora_plus_20251202_001141/checkpoint-448/latest
new file mode 100644
index 0000000000000000000000000000000000000000..6c83691d1f18f1aa59c0994e76f1e0d010c88273
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-448/latest
@@ -0,0 +1 @@
+global_step448
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-448/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-448/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/tokenizer_config.json b/dapo_lora_plus_20251202_001141/checkpoint-448/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-448/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-448/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..144608798fd985252409e72e2ff77d3c5e6f92a1
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-448/trainer_state.json
@@ -0,0 +1,13922 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.41214351425942963,
+  "eval_steps": 500,
+  "global_step": 448,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025745572056621313,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 5.499582130141789e-06,
+      "clip_ratio/high_mean": 1.3748955325354473e-06,
+      "clip_ratio/low_mean": 2.871888784738985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009378326623846e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 4767.1875,
+      "completions/mean_terminated_length": 4767.1875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.088237851858139,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002068034838885069,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 1425798.0,
+      "reward": 0.3046875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 0.01811397261917591,
+      "sampling/sampling_logp_difference/max": 4.011071681976318,
+      "sampling/sampling_logp_difference/mean": 0.01877593621611595,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.459846724103045e-05,
+      "clip_ratio/low_min": 3.4060874440910993e-06,
+      "clip_ratio/region_mean": 4.459846724103045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 6586.359375,
+      "completions/mean_terminated_length": 6351.21630859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0497623533010483,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001971944235265255,
+      "learning_rate": 1e-05,
+      "loss": 0.0199,
+      "num_tokens": 2287420.0,
+      "reward": 0.28125,
+      "reward_std": 0.29143062233924866,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999316334724426,
+      "sampling/importance_sampling_ratio/min": 5.356698966352269e-05,
+      "sampling/sampling_logp_difference/max": 9.834577560424805,
+      "sampling/sampling_logp_difference/mean": 0.02137824520468712,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.7640652004047297e-05,
+      "clip_ratio/high_mean": 5.48578327652649e-06,
+      "clip_ratio/low_mean": 3.218628648937738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.767206976590387e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14690.0,
+      "completions/max_terminated_length": 14690.0,
+      "completions/mean_length": 5448.0234375,
+      "completions/mean_terminated_length": 5448.0234375,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 1.1134418621659279,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016465173102915287,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 3009167.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27958330512046814,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 7.889385415182915e-06,
+      "sampling/sampling_logp_difference/max": 11.749992370605469,
+      "sampling/sampling_logp_difference/mean": 0.020580951124429703,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 1.3439519989333348e-05,
+      "clip_ratio/high_mean": 3.359879997333337e-06,
+      "clip_ratio/low_mean": 2.8849915906903334e-05,
+      "clip_ratio/low_min": 8.467687621305231e-06,
+      "clip_ratio/region_mean": 3.220979442630778e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13420.0,
+      "completions/mean_length": 5436.8671875,
+      "completions/mean_terminated_length": 5350.66943359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 1.1473859176039696,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023770295083522797,
+      "learning_rate": 1e-05,
+      "loss": 0.0153,
+      "num_tokens": 3725654.0,
+      "reward": 0.2734375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99991774559021,
+      "sampling/importance_sampling_ratio/min": 0.0011146117467433214,
+      "sampling/sampling_logp_difference/max": 6.799249172210693,
+      "sampling/sampling_logp_difference/mean": 0.020377254113554955,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 4.652201369026443e-06,
+      "clip_ratio/high_mean": 1.1630503422566107e-06,
+      "clip_ratio/low_mean": 2.8399212624208303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9562263534899103e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14440.0,
+      "completions/max_terminated_length": 14440.0,
+      "completions/mean_length": 4697.5390625,
+      "completions/mean_terminated_length": 4697.5390625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.0097229778766632,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003342699259519577,
+      "learning_rate": 1e-05,
+      "loss": 0.0326,
+      "num_tokens": 4345547.0,
+      "reward": 0.390625,
+      "reward_std": 0.34480881690979004,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914765357971,
+      "sampling/importance_sampling_ratio/min": 0.002385853324085474,
+      "sampling/sampling_logp_difference/max": 6.038198471069336,
+      "sampling/sampling_logp_difference/mean": 0.0185473021119833,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 9.362594937556423e-06,
+      "clip_ratio/high_mean": 2.340648734389106e-06,
+      "clip_ratio/low_mean": 6.054362825125281e-05,
+      "clip_ratio/low_min": 7.427356649714056e-06,
+      "clip_ratio/region_mean": 6.288427744038927e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14652.0,
+      "completions/mean_length": 6218.2109375,
+      "completions/mean_terminated_length": 5890.2822265625,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.0579778030514717,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002073560608550906,
+      "learning_rate": 1e-05,
+      "loss": 0.0201,
+      "num_tokens": 5160646.0,
+      "reward": 0.2109375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 0.00044544730917550623,
+      "sampling/sampling_logp_difference/max": 7.716431617736816,
+      "sampling/sampling_logp_difference/mean": 0.020321575924754143,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 1.1064067621191498e-05,
+      "clip_ratio/high_mean": 2.7660169052978745e-06,
+      "clip_ratio/low_mean": 2.2175867059104348e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4941883737028547e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13637.0,
+      "completions/mean_length": 5127.8359375,
+      "completions/mean_terminated_length": 5039.20458984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.0472618415951729,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032994600478559732,
+      "learning_rate": 1e-05,
+      "loss": 0.0751,
+      "num_tokens": 5836289.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999483227729797,
+      "sampling/importance_sampling_ratio/min": 0.0013780994340777397,
+      "sampling/sampling_logp_difference/max": 6.587049961090088,
+      "sampling/sampling_logp_difference/mean": 0.01940803974866867,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 1.2357884770608507e-05,
+      "clip_ratio/high_mean": 3.0894711926521268e-06,
+      "clip_ratio/low_mean": 3.000627111759968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.309574231025181e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15916.0,
+      "completions/mean_length": 4516.890625,
+      "completions/mean_terminated_length": 4423.44873046875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "entropy": 0.911251038312912,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003016560571268201,
+      "learning_rate": 1e-05,
+      "loss": 0.1006,
+      "num_tokens": 6433171.0,
+      "reward": 0.390625,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999179840087891,
+      "sampling/importance_sampling_ratio/min": 0.005480794236063957,
+      "sampling/sampling_logp_difference/max": 5.206505298614502,
+      "sampling/sampling_logp_difference/mean": 0.017437148839235306,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 4.6329013457580004e-05,
+      "clip_ratio/high_mean": 1.1582253364395001e-05,
+      "clip_ratio/low_mean": 7.069455705277505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.227681109929108e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13970.0,
+      "completions/mean_length": 4961.453125,
+      "completions/mean_terminated_length": 4687.31201171875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "entropy": 0.6808596402406693,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0035386616364121437,
+      "learning_rate": 1e-05,
+      "loss": 0.0596,
+      "num_tokens": 7085389.0,
+      "reward": 0.5625,
+      "reward_std": 0.3816363215446472,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0002734088629949838,
+      "sampling/sampling_logp_difference/max": 8.20454216003418,
+      "sampling/sampling_logp_difference/mean": 0.01566406339406967,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 2.43190661421977e-05,
+      "clip_ratio/high_mean": 6.079766535549425e-06,
+      "clip_ratio/low_mean": 2.2395396172214532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8475162707763957e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14776.0,
+      "completions/mean_length": 4429.40625,
+      "completions/mean_terminated_length": 4335.275390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9181502386927605,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022535293828696012,
+      "learning_rate": 1e-05,
+      "loss": 0.0031,
+      "num_tokens": 7672185.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20357418060302734,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998801946640015,
+      "sampling/importance_sampling_ratio/min": 5.315856554943821e-08,
+      "sampling/sampling_logp_difference/max": 16.74998664855957,
+      "sampling/sampling_logp_difference/mean": 0.018429335206747055,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 1.0117325928149512e-05,
+      "clip_ratio/high_mean": 2.529331482037378e-06,
+      "clip_ratio/low_mean": 1.1982813475697185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.45121450714214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5282.6796875,
+      "completions/mean_terminated_length": 5106.46875,
+      "completions/min_length": 323.0,
+      "completions/min_terminated_length": 323.0,
+      "entropy": 1.113751620054245,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013591813622042537,
+      "learning_rate": 1e-05,
+      "loss": 0.0971,
+      "num_tokens": 8369000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3029736578464508,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 3.970265970565379e-05,
+      "sampling/sampling_logp_difference/max": 10.134092330932617,
+      "sampling/sampling_logp_difference/mean": 0.020221836864948273,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 5.411958227341529e-06,
+      "clip_ratio/high_mean": 1.3529895568353822e-06,
+      "clip_ratio/low_mean": 2.5284593846208736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6637583516730956e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15925.0,
+      "completions/mean_length": 6970.421875,
+      "completions/mean_terminated_length": 6744.49609375,
+      "completions/min_length": 53.0,
+      "completions/min_terminated_length": 53.0,
+      "entropy": 1.1721933633089066,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024079051800072193,
+      "learning_rate": 1e-05,
+      "loss": 0.0713,
+      "num_tokens": 9283182.0,
+      "reward": 0.171875,
+      "reward_std": 0.17965975403785706,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999163746833801,
+      "sampling/importance_sampling_ratio/min": 0.0008915197686292231,
+      "sampling/sampling_logp_difference/max": 7.0225830078125,
+      "sampling/sampling_logp_difference/mean": 0.021462474018335342,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 2.0661535927501973e-05,
+      "clip_ratio/high_mean": 5.165383981875493e-06,
+      "clip_ratio/low_mean": 2.4304956298237812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.947033948430544e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14658.0,
+      "completions/max_terminated_length": 14658.0,
+      "completions/mean_length": 4886.875,
+      "completions/mean_terminated_length": 4886.875,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 1.0108910650014877,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002063734456896782,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 9928446.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2409384697675705,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000026226043701,
+      "sampling/importance_sampling_ratio/min": 0.0003672837920021266,
+      "sampling/sampling_logp_difference/max": 7.9093756675720215,
+      "sampling/sampling_logp_difference/mean": 0.01918785460293293,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.4761846993424115e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4761846993424115e-06,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12992.0,
+      "completions/max_terminated_length": 12992.0,
+      "completions/mean_length": 4824.0078125,
+      "completions/mean_terminated_length": 4824.0078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 1.1070282831788063,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002424790756776929,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 10566415.0,
+      "reward": 0.28125,
+      "reward_std": 0.23698672652244568,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0011708867968991399,
+      "sampling/sampling_logp_difference/max": 6.749993801116943,
+      "sampling/sampling_logp_difference/mean": 0.02069389820098877,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 3.5075904634140898e-06,
+      "clip_ratio/high_mean": 8.768976158535224e-07,
+      "clip_ratio/low_mean": 2.2676964135825983e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3553861751679506e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12685.0,
+      "completions/mean_length": 5449.4140625,
+      "completions/mean_terminated_length": 5363.31494140625,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9817888736724854,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021046048495918512,
+      "learning_rate": 1e-05,
+      "loss": 0.0252,
+      "num_tokens": 11281908.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805688858032,
+      "sampling/importance_sampling_ratio/min": 0.013273254036903381,
+      "sampling/sampling_logp_difference/max": 4.322004318237305,
+      "sampling/sampling_logp_difference/mean": 0.019556276500225067,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 1.624216065465589e-05,
+      "clip_ratio/high_mean": 4.060540163663973e-06,
+      "clip_ratio/low_mean": 5.4349347919924185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.840988796990132e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14133.0,
+      "completions/max_terminated_length": 14133.0,
+      "completions/mean_length": 5343.25,
+      "completions/mean_terminated_length": 5343.25,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 1.04741720110178,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035894038155674934,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 11987692.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998996257781982,
+      "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05,
+      "sampling/sampling_logp_difference/max": 10.749964714050293,
+      "sampling/sampling_logp_difference/mean": 0.020530637353658676,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.272115029380075e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.272115029380075e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15138.0,
+      "completions/mean_length": 6301.9375,
+      "completions/mean_terminated_length": 5806.09814453125,
+      "completions/min_length": 72.0,
+      "completions/min_terminated_length": 72.0,
+      "entropy": 0.8892941772937775,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032246762420982122,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 12814244.0,
+      "reward": 0.3125,
+      "reward_std": 0.3606000542640686,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999184608459473,
+      "sampling/importance_sampling_ratio/min": 0.021351110190153122,
+      "sampling/sampling_logp_difference/max": 3.846651554107666,
+      "sampling/sampling_logp_difference/mean": 0.017541853711009026,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 9.956602298188955e-06,
+      "clip_ratio/high_mean": 2.4891505745472386e-06,
+      "clip_ratio/low_mean": 2.772165316855535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0210803743102588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16213.0,
+      "completions/max_terminated_length": 16213.0,
+      "completions/mean_length": 5297.46875,
+      "completions/mean_terminated_length": 5297.46875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8097029253840446,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023969109170138836,
+      "learning_rate": 1e-05,
+      "loss": -0.0153,
+      "num_tokens": 13512520.0,
+      "reward": 0.359375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999222159385681,
+      "sampling/importance_sampling_ratio/min": 0.005766105372458696,
+      "sampling/sampling_logp_difference/max": 5.155758380889893,
+      "sampling/sampling_logp_difference/mean": 0.017464376986026764,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 1.0098337497765897e-05,
+      "clip_ratio/high_mean": 2.524584374441474e-06,
+      "clip_ratio/low_mean": 3.173396362399217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.425854845318099e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14655.0,
+      "completions/mean_length": 4890.34375,
+      "completions/mean_terminated_length": 4799.84228515625,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.9267145916819572,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002759338356554508,
+      "learning_rate": 1e-05,
+      "loss": -0.0014,
+      "num_tokens": 14155556.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.008491010405123234,
+      "sampling/sampling_logp_difference/max": 4.768747329711914,
+      "sampling/sampling_logp_difference/mean": 0.018839433789253235,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 7.532389190600952e-06,
+      "clip_ratio/high_mean": 1.883097297650238e-06,
+      "clip_ratio/low_mean": 1.9051809317716106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0934906729053182e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16296.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 4609.40625,
+      "completions/mean_terminated_length": 4609.40625,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 1.171089917421341,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021055075339972973,
+      "learning_rate": 1e-05,
+      "loss": -0.0051,
+      "num_tokens": 14765328.0,
+      "reward": 0.2421875,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741911888123,
+      "sampling/importance_sampling_ratio/min": 5.368983693188056e-07,
+      "sampling/sampling_logp_difference/max": 14.437457084655762,
+      "sampling/sampling_logp_difference/mean": 0.020226795226335526,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.7169573766295798e-05,
+      "clip_ratio/high_mean": 4.2923934415739495e-06,
+      "clip_ratio/low_mean": 5.869748633813288e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.0162142189074075e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14299.0,
+      "completions/mean_length": 5099.0390625,
+      "completions/mean_terminated_length": 5010.18115234375,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.005959376692772,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027595218271017075,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 15438549.0,
+      "reward": 0.296875,
+      "reward_std": 0.20069602131843567,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887347221375,
+      "sampling/importance_sampling_ratio/min": 0.00013984869292471558,
+      "sampling/sampling_logp_difference/max": 8.87494945526123,
+      "sampling/sampling_logp_difference/mean": 0.01902824640274048,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 5.162942670722259e-06,
+      "clip_ratio/high_mean": 1.2907356676805648e-06,
+      "clip_ratio/low_mean": 3.6872071063953626e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.816280593582633e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7138.0390625,
+      "completions/mean_terminated_length": 6839.7822265625,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.0403362140059471,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002748022088780999,
+      "learning_rate": 1e-05,
+      "loss": 0.0647,
+      "num_tokens": 16373898.0,
+      "reward": 0.296875,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999048709869385,
+      "sampling/importance_sampling_ratio/min": 0.0003802926803473383,
+      "sampling/sampling_logp_difference/max": 7.874569416046143,
+      "sampling/sampling_logp_difference/mean": 0.020853528752923012,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.6506045439164154e-05,
+      "clip_ratio/low_min": 5.709326615033206e-06,
+      "clip_ratio/region_mean": 5.6506045439164154e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14543.0,
+      "completions/mean_length": 5420.515625,
+      "completions/mean_terminated_length": 5334.18896484375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 1.1339883506298065,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029502976685762405,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 17088156.0,
+      "reward": 0.1953125,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 9.70982582657598e-05,
+      "sampling/sampling_logp_difference/max": 9.239787101745605,
+      "sampling/sampling_logp_difference/mean": 0.0199423898011446,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 5.619998319161823e-06,
+      "clip_ratio/high_mean": 1.4049995797904558e-06,
+      "clip_ratio/low_mean": 6.439320418394345e-05,
+      "clip_ratio/low_min": 4.70632539872895e-06,
+      "clip_ratio/region_mean": 6.57982034226734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14636.0,
+      "completions/mean_length": 5116.3046875,
+      "completions/mean_terminated_length": 4845.88037109375,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.9503882825374603,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004891107324510813,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 17766619.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0010618992382660508,
+      "sampling/sampling_logp_difference/max": 6.847696304321289,
+      "sampling/sampling_logp_difference/mean": 0.01914183795452118,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.839018643247982e-05,
+      "clip_ratio/low_min": 4.115091087442124e-06,
+      "clip_ratio/region_mean": 3.839018643247982e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14634.0,
+      "completions/mean_length": 5061.8671875,
+      "completions/mean_terminated_length": 4972.71630859375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.0540335327386856,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030373274348676205,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 18432938.0,
+      "reward": 0.34375,
+      "reward_std": 0.28118088841438293,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999624490737915,
+      "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06,
+      "sampling/sampling_logp_difference/max": 13.272432327270508,
+      "sampling/sampling_logp_difference/mean": 0.019548218697309494,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.4656657867817557e-05,
+      "clip_ratio/high_mean": 4.665093399580655e-06,
+      "clip_ratio/low_mean": 3.751162262233265e-05,
+      "clip_ratio/low_min": 4.413062470121076e-06,
+      "clip_ratio/region_mean": 4.2176716192443564e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15782.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6349.9765625,
+      "completions/mean_terminated_length": 6349.9765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0268081277608871,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017623496241867542,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 19264743.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 6.870362267363816e-05,
+      "sampling/sampling_logp_difference/max": 9.585708618164062,
+      "sampling/sampling_logp_difference/mean": 0.019106190651655197,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 9.221375876222737e-06,
+      "clip_ratio/high_mean": 2.3053439690556843e-06,
+      "clip_ratio/low_mean": 3.09787185415189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.328406273794826e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 5815.484375,
+      "completions/mean_terminated_length": 5561.84033203125,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 1.0389493256807327,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003111837198957801,
+      "learning_rate": 1e-05,
+      "loss": -0.0162,
+      "num_tokens": 20030109.0,
+      "reward": 0.34375,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000298023223877,
+      "sampling/importance_sampling_ratio/min": 0.02987043187022209,
+      "sampling/sampling_logp_difference/max": 3.5108861923217773,
+      "sampling/sampling_logp_difference/mean": 0.020060991868376732,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 6.7810142354574054e-06,
+      "clip_ratio/high_mean": 1.6952535588643514e-06,
+      "clip_ratio/low_mean": 4.474762545214617e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644287901101052e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 5157.1484375,
+      "completions/mean_terminated_length": 5068.748046875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.0510126948356628,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003041633637621999,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 20710904.0,
+      "reward": 0.3125,
+      "reward_std": 0.35612428188323975,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 0.04357198625802994,
+      "sampling/sampling_logp_difference/max": 3.133340835571289,
+      "sampling/sampling_logp_difference/mean": 0.019007597118616104,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0962848566341563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0962848566341563e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15333.0,
+      "completions/max_terminated_length": 15333.0,
+      "completions/mean_length": 4446.3828125,
+      "completions/mean_terminated_length": 4446.3828125,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.053279548883438,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022369560319930315,
+      "learning_rate": 1e-05,
+      "loss": -0.001,
+      "num_tokens": 21298497.0,
+      "reward": 0.390625,
+      "reward_std": 0.24169495701789856,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998750686645508,
+      "sampling/importance_sampling_ratio/min": 0.006704842206090689,
+      "sampling/sampling_logp_difference/max": 5.00492525100708,
+      "sampling/sampling_logp_difference/mean": 0.01947362720966339,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8460265411922592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8460265411922592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15386.0,
+      "completions/mean_length": 6294.1484375,
+      "completions/mean_terminated_length": 6133.9921875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.2036212533712387,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021383841522037983,
+      "learning_rate": 1e-05,
+      "loss": 0.033,
+      "num_tokens": 22124812.0,
+      "reward": 0.171875,
+      "reward_std": 0.20752590894699097,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858736991882,
+      "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07,
+      "sampling/sampling_logp_difference/max": 14.742476463317871,
+      "sampling/sampling_logp_difference/mean": 0.022367021068930626,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 1.73864664247958e-05,
+      "clip_ratio/high_mean": 4.34661660619895e-06,
+      "clip_ratio/low_mean": 3.19569651310303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.630358173722925e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14893.0,
+      "completions/mean_length": 6011.4921875,
+      "completions/mean_terminated_length": 5929.81884765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.123318687081337,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00126531848218292,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 22915091.0,
+      "reward": 0.171875,
+      "reward_std": 0.2330477386713028,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05,
+      "sampling/sampling_logp_difference/max": 11.02016544342041,
+      "sampling/sampling_logp_difference/mean": 0.019905246794223785,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 2.8753217975463485e-05,
+      "clip_ratio/high_mean": 7.188304493865871e-06,
+      "clip_ratio/low_mean": 3.818478444372886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.537308905128157e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5152.46875,
+      "completions/mean_terminated_length": 5064.03125,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "entropy": 1.0477670058608055,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030069497879594564,
+      "learning_rate": 1e-05,
+      "loss": 0.1026,
+      "num_tokens": 23596487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29142576456069946,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999433755874634,
+      "sampling/importance_sampling_ratio/min": 9.009604013954231e-07,
+      "sampling/sampling_logp_difference/max": 13.919804573059082,
+      "sampling/sampling_logp_difference/mean": 0.019003981724381447,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 3.069575450354023e-05,
+      "clip_ratio/high_mean": 7.673938625885057e-06,
+      "clip_ratio/low_mean": 3.4847614415411954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.252155258654966e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12792.0,
+      "completions/max_terminated_length": 12792.0,
+      "completions/mean_length": 4672.5703125,
+      "completions/mean_terminated_length": 4672.5703125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9471446052193642,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002676331205293536,
+      "learning_rate": 1e-05,
+      "loss": 0.0724,
+      "num_tokens": 24213408.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2988021969795227,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000251531600952,
+      "sampling/importance_sampling_ratio/min": 0.0013351094676181674,
+      "sampling/sampling_logp_difference/max": 6.618741989135742,
+      "sampling/sampling_logp_difference/mean": 0.0179576613008976,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6127243245355203e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6127243245355203e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16108.0,
+      "completions/mean_length": 7013.734375,
+      "completions/mean_terminated_length": 6711.4677734375,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "entropy": 1.1254516392946243,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023615453392267227,
+      "learning_rate": 1e-05,
+      "loss": 0.0384,
+      "num_tokens": 25130262.0,
+      "reward": 0.1953125,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06,
+      "sampling/sampling_logp_difference/max": 11.925450325012207,
+      "sampling/sampling_logp_difference/mean": 0.0215257927775383,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 4.06954040954588e-06,
+      "clip_ratio/high_mean": 1.01738510238647e-06,
+      "clip_ratio/low_mean": 4.180071573500754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.281810015527299e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5858.59375,
+      "completions/mean_terminated_length": 5605.984375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "entropy": 1.0713739022612572,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029018481727689505,
+      "learning_rate": 1e-05,
+      "loss": 0.1041,
+      "num_tokens": 25898194.0,
+      "reward": 0.3671875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999915957450867,
+      "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05,
+      "sampling/sampling_logp_difference/max": 10.992064476013184,
+      "sampling/sampling_logp_difference/mean": 0.019959844648838043,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 1.2810827229259303e-05,
+      "clip_ratio/high_mean": 3.2027068073148257e-06,
+      "clip_ratio/low_mean": 3.29701083501277e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.617281504375569e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14004.0,
+      "completions/mean_length": 6952.6015625,
+      "completions/mean_terminated_length": 6726.24853515625,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "entropy": 1.028619796037674,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022342968732118607,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 26812791.0,
+      "reward": 0.234375,
+      "reward_std": 0.26827272772789,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 4.540153167909011e-05,
+      "sampling/sampling_logp_difference/max": 9.999964714050293,
+      "sampling/sampling_logp_difference/mean": 0.02002539485692978,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 1.5225089100567857e-05,
+      "clip_ratio/high_mean": 6.960676159906143e-06,
+      "clip_ratio/low_mean": 4.09088329433871e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7869508762232726e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16361.0,
+      "completions/mean_length": 6413.421875,
+      "completions/mean_terminated_length": 6174.12841796875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9452399462461472,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021800603717565536,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 27652757.0,
+      "reward": 0.296875,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439120292664,
+      "sampling/importance_sampling_ratio/min": 3.895394547726028e-05,
+      "sampling/sampling_logp_difference/max": 10.153130531311035,
+      "sampling/sampling_logp_difference/mean": 0.019722118973731995,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.9564903318023426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9564903318023426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15754.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 5176.3515625,
+      "completions/mean_terminated_length": 5176.3515625,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 1.0444758981466293,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004153470974415541,
+      "learning_rate": 1e-05,
+      "loss": 0.0798,
+      "num_tokens": 28334386.0,
+      "reward": 0.2734375,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.007421077694743872,
+      "sampling/sampling_logp_difference/max": 4.903430938720703,
+      "sampling/sampling_logp_difference/mean": 0.020159056410193443,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 1.725743459246587e-05,
+      "clip_ratio/high_mean": 4.3143586481164675e-06,
+      "clip_ratio/low_mean": 2.0204584302518924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.451894306432223e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 5178.9921875,
+      "completions/mean_terminated_length": 5001.13525390625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0803537145256996,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002477057045325637,
+      "learning_rate": 1e-05,
+      "loss": 0.0067,
+      "num_tokens": 29017145.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000497102737427,
+      "sampling/importance_sampling_ratio/min": 0.004630985204130411,
+      "sampling/sampling_logp_difference/max": 5.374985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019826076924800873,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 1.6637992303003557e-05,
+      "clip_ratio/high_mean": 4.159498075750889e-06,
+      "clip_ratio/low_mean": 2.1970684144889674e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6130182106953725e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14131.0,
+      "completions/max_terminated_length": 14131.0,
+      "completions/mean_length": 4980.359375,
+      "completions/mean_terminated_length": 4980.359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.9510642662644386,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016275218222290277,
+      "learning_rate": 1e-05,
+      "loss": -0.0097,
+      "num_tokens": 29673535.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750852584839,
+      "sampling/importance_sampling_ratio/min": 0.000599516904912889,
+      "sampling/sampling_logp_difference/max": 7.419386386871338,
+      "sampling/sampling_logp_difference/mean": 0.01844976656138897,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 2.8087193186365766e-05,
+      "clip_ratio/high_mean": 7.021798296591442e-06,
+      "clip_ratio/low_mean": 3.9683913541921356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.670571286169434e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 5778.6953125,
+      "completions/mean_terminated_length": 5695.18896484375,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 1.0413239300251007,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001847646082751453,
+      "learning_rate": 1e-05,
+      "loss": -0.0045,
+      "num_tokens": 30436416.0,
+      "reward": 0.2578125,
+      "reward_std": 0.33903977274894714,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998501539230347,
+      "sampling/importance_sampling_ratio/min": 0.00020348970429040492,
+      "sampling/sampling_logp_difference/max": 8.499895095825195,
+      "sampling/sampling_logp_difference/mean": 0.021502099931240082,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 2.68402091023745e-05,
+      "clip_ratio/high_mean": 8.575278570788214e-06,
+      "clip_ratio/low_mean": 4.547183698377921e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.404711600931478e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14182.0,
+      "completions/max_terminated_length": 14182.0,
+      "completions/mean_length": 4875.125,
+      "completions/mean_terminated_length": 4875.125,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "entropy": 1.0464690178632736,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021134833805263042,
+      "learning_rate": 1e-05,
+      "loss": 0.0727,
+      "num_tokens": 31083672.0,
+      "reward": 0.40625,
+      "reward_std": 0.3584783971309662,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340176582336,
+      "sampling/importance_sampling_ratio/min": 0.012113225646317005,
+      "sampling/sampling_logp_difference/max": 4.41345739364624,
+      "sampling/sampling_logp_difference/mean": 0.019140049815177917,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 3.9877967992651975e-05,
+      "clip_ratio/high_mean": 9.969491998162994e-06,
+      "clip_ratio/low_mean": 3.981287841270387e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9782369273998484e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 4691.421875,
+      "completions/mean_terminated_length": 4505.82568359375,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 1.0229775309562683,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037735572550445795,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 31703654.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2993389964103699,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492168426514,
+      "sampling/importance_sampling_ratio/min": 0.03150063753128052,
+      "sampling/sampling_logp_difference/max": 3.457747459411621,
+      "sampling/sampling_logp_difference/mean": 0.01912039890885353,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 3.5441889849607833e-06,
+      "clip_ratio/high_mean": 8.860472462401958e-07,
+      "clip_ratio/low_mean": 1.5137359810069029e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6023407056309225e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 6821.96875,
+      "completions/mean_terminated_length": 6592.48046875,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 1.1132484003901482,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0010448681423440576,
+      "learning_rate": 1e-05,
+      "loss": 0.022,
+      "num_tokens": 32599778.0,
+      "reward": 0.2265625,
+      "reward_std": 0.1814819872379303,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999915361404419,
+      "sampling/importance_sampling_ratio/min": 0.006500681862235069,
+      "sampling/sampling_logp_difference/max": 5.035848140716553,
+      "sampling/sampling_logp_difference/mean": 0.02125459350645542,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 4.652893949241843e-06,
+      "clip_ratio/high_mean": 1.1632234873104608e-06,
+      "clip_ratio/low_mean": 5.731516603191267e-05,
+      "clip_ratio/low_min": 9.891066838463303e-06,
+      "clip_ratio/region_mean": 5.8478389746596804e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 6834.3671875,
+      "completions/mean_terminated_length": 6605.17626953125,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9827468693256378,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0017670176457613707,
+      "learning_rate": 1e-05,
+      "loss": 0.1105,
+      "num_tokens": 33492737.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3440523147583008,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.0021202093921601772,
+      "sampling/sampling_logp_difference/max": 6.156240463256836,
+      "sampling/sampling_logp_difference/mean": 0.019490526989102364,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 6.717360520269722e-06,
+      "clip_ratio/high_mean": 2.503530367903295e-06,
+      "clip_ratio/low_mean": 2.5672919832686603e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8176450200589898e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14098.0,
+      "completions/mean_length": 6175.296875,
+      "completions/mean_terminated_length": 5845.98388671875,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 1.1584237962961197,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0016891945851966739,
+      "learning_rate": 1e-05,
+      "loss": -0.0008,
+      "num_tokens": 34312455.0,
+      "reward": 0.1875,
+      "reward_std": 0.19673937559127808,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 8.086384332273155e-05,
+      "sampling/sampling_logp_difference/max": 9.422743797302246,
+      "sampling/sampling_logp_difference/mean": 0.021749887615442276,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 2.2362002255249536e-05,
+      "clip_ratio/high_mean": 8.189798336388776e-06,
+      "clip_ratio/low_mean": 2.1058204993096297e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9248002192616696e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16054.0,
+      "completions/mean_length": 6036.8359375,
+      "completions/mean_terminated_length": 5955.3623046875,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.9301538467407227,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003834392176941037,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 35102738.0,
+      "reward": 0.4375,
+      "reward_std": 0.36614155769348145,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998494386672974,
+      "sampling/importance_sampling_ratio/min": 0.00013992394087836146,
+      "sampling/sampling_logp_difference/max": 8.874411582946777,
+      "sampling/sampling_logp_difference/mean": 0.019147861748933792,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1501961580506759e-05,
+      "clip_ratio/high_mean": 2.8754903951266897e-06,
+      "clip_ratio/low_mean": 4.08189714562468e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.369446196506033e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6262.46875,
+      "completions/mean_terminated_length": 5764.68798828125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "entropy": 0.8599015846848488,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029804729856550694,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 35924886.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3911295533180237,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999922513961792,
+      "sampling/importance_sampling_ratio/min": 0.00021375219512265176,
+      "sampling/sampling_logp_difference/max": 9.904524803161621,
+      "sampling/sampling_logp_difference/mean": 0.01815103553235531,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 2.4107544049911667e-05,
+      "clip_ratio/high_mean": 6.026886012477917e-06,
+      "clip_ratio/low_mean": 3.6588148361715866e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.261503391944643e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14556.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 5926.8984375,
+      "completions/mean_terminated_length": 5926.8984375,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "entropy": 1.0042993426322937,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022071697749197483,
+      "learning_rate": 1e-05,
+      "loss": 0.0059,
+      "num_tokens": 36700913.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3306073546409607,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000010371208191,
+      "sampling/importance_sampling_ratio/min": 0.0005220364546403289,
+      "sampling/sampling_logp_difference/max": 7.557773113250732,
+      "sampling/sampling_logp_difference/mean": 0.01954064890742302,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 4.9106265578302555e-06,
+      "clip_ratio/high_mean": 1.2276566394575639e-06,
+      "clip_ratio/low_mean": 2.634599570683349e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7573652346291055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15217.0,
+      "completions/mean_length": 6873.6875,
+      "completions/mean_terminated_length": 6645.4404296875,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 1.0255412608385086,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002320924773812294,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 37604865.0,
+      "reward": 0.234375,
+      "reward_std": 0.3135228157043457,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999098777770996,
+      "sampling/importance_sampling_ratio/min": 0.026153141632676125,
+      "sampling/sampling_logp_difference/max": 3.6437859535217285,
+      "sampling/sampling_logp_difference/mean": 0.019532475620508194,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.6350510122720152e-05,
+      "clip_ratio/high_mean": 4.087627530680038e-06,
+      "clip_ratio/low_mean": 2.351988746340794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7607515221461654e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15668.0,
+      "completions/mean_length": 6073.8984375,
+      "completions/mean_terminated_length": 5992.71630859375,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 1.0713753998279572,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002212709980085492,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 38405196.0,
+      "reward": 0.359375,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998978972434998,
+      "sampling/importance_sampling_ratio/min": 8.706459084351081e-06,
+      "sampling/sampling_logp_difference/max": 11.651445388793945,
+      "sampling/sampling_logp_difference/mean": 0.021252838894724846,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.729486718384578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.729486718384578e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15299.0,
+      "completions/mean_length": 5838.71875,
+      "completions/mean_terminated_length": 5671.33349609375,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "entropy": 1.021155133843422,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001135052996687591,
+      "learning_rate": 1e-05,
+      "loss": 0.0178,
+      "num_tokens": 39171704.0,
+      "reward": 0.28125,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.003084881929680705,
+      "sampling/sampling_logp_difference/max": 5.7812418937683105,
+      "sampling/sampling_logp_difference/mean": 0.020781882107257843,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 1.7124169744420215e-05,
+      "clip_ratio/high_mean": 4.281042436105054e-06,
+      "clip_ratio/low_mean": 3.706903294187214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.135007543482061e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14617.0,
+      "completions/max_terminated_length": 14617.0,
+      "completions/mean_length": 6358.5859375,
+      "completions/mean_terminated_length": 6358.5859375,
+      "completions/min_length": 940.0,
+      "completions/min_terminated_length": 940.0,
+      "entropy": 0.9720487147569656,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002638082252815366,
+      "learning_rate": 1e-05,
+      "loss": 0.0145,
+      "num_tokens": 40003859.0,
+      "reward": 0.40625,
+      "reward_std": 0.3174618184566498,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000380277633667,
+      "sampling/importance_sampling_ratio/min": 0.01960253342986107,
+      "sampling/sampling_logp_difference/max": 3.932096481323242,
+      "sampling/sampling_logp_difference/mean": 0.01991666667163372,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 6.55582925901399e-06,
+      "clip_ratio/high_mean": 2.994117721755174e-06,
+      "clip_ratio/low_mean": 2.222621503733535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5220332759090525e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14753.0,
+      "completions/max_terminated_length": 14753.0,
+      "completions/mean_length": 4634.1875,
+      "completions/mean_terminated_length": 4634.1875,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9715309366583824,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001994960242882371,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 40616483.0,
+      "reward": 0.4375,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000698566436768,
+      "sampling/importance_sampling_ratio/min": 1.0510009815334342e-05,
+      "sampling/sampling_logp_difference/max": 11.46318244934082,
+      "sampling/sampling_logp_difference/mean": 0.01902047172188759,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 2.2474248908110894e-05,
+      "clip_ratio/high_mean": 7.571314540655294e-06,
+      "clip_ratio/low_mean": 4.3583780325207044e-05,
+      "clip_ratio/low_min": 4.6013396968191955e-06,
+      "clip_ratio/region_mean": 5.1155094070054474e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 6596.25,
+      "completions/mean_terminated_length": 6361.34423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 0.8207943215966225,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019902780186384916,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 41484443.0,
+      "reward": 0.4453125,
+      "reward_std": 0.326668381690979,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000016689300537,
+      "sampling/importance_sampling_ratio/min": 7.485233072657138e-05,
+      "sampling/sampling_logp_difference/max": 9.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.018301833420991898,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 3.0019932637515012e-06,
+      "clip_ratio/high_mean": 7.504983159378753e-07,
+      "clip_ratio/low_mean": 4.332785601945943e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.407835376696312e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6785.75,
+      "completions/mean_terminated_length": 6313.70458984375,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "entropy": 0.9876058474183083,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015235114842653275,
+      "learning_rate": 1e-05,
+      "loss": 0.0128,
+      "num_tokens": 42372235.0,
+      "reward": 0.2421875,
+      "reward_std": 0.325075626373291,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999551773071289,
+      "sampling/importance_sampling_ratio/min": 0.026679370552301407,
+      "sampling/sampling_logp_difference/max": 3.6238646507263184,
+      "sampling/sampling_logp_difference/mean": 0.019945615902543068,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1349006601667497e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1349006601667497e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 4881.2109375,
+      "completions/mean_terminated_length": 4510.1533203125,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.989942155778408,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002033712575212121,
+      "learning_rate": 1e-05,
+      "loss": 0.1088,
+      "num_tokens": 43015238.0,
+      "reward": 0.4375,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000300407409668,
+      "sampling/importance_sampling_ratio/min": 0.0001238943514181301,
+      "sampling/sampling_logp_difference/max": 8.996081352233887,
+      "sampling/sampling_logp_difference/mean": 0.01887543685734272,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 2.584004687378183e-05,
+      "clip_ratio/high_mean": 6.4600117184454575e-06,
+      "clip_ratio/low_mean": 2.1371045761497953e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7831058105221018e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15001.0,
+      "completions/max_terminated_length": 15001.0,
+      "completions/mean_length": 4725.3984375,
+      "completions/mean_terminated_length": 4725.3984375,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 1.0350637435913086,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030296226032078266,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 43637737.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999939203262329,
+      "sampling/importance_sampling_ratio/min": 0.00022932067804504186,
+      "sampling/sampling_logp_difference/max": 8.380389213562012,
+      "sampling/sampling_logp_difference/mean": 0.01995944231748581,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 1.994733975152485e-05,
+      "clip_ratio/high_mean": 4.986834937881213e-06,
+      "clip_ratio/low_mean": 3.5168303838872816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.015513832200668e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16240.0,
+      "completions/mean_length": 4918.171875,
+      "completions/mean_terminated_length": 4736.1748046875,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.965274304151535,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002758471528068185,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 44285327.0,
+      "reward": 0.328125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999663233757019,
+      "sampling/importance_sampling_ratio/min": 0.010958661325275898,
+      "sampling/sampling_logp_difference/max": 4.513625144958496,
+      "sampling/sampling_logp_difference/mean": 0.019083233550190926,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 1.0621563887980301e-05,
+      "clip_ratio/high_mean": 2.6553909719950752e-06,
+      "clip_ratio/low_mean": 3.838553107016196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1040922042157035e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15031.0,
+      "completions/mean_length": 4998.2890625,
+      "completions/mean_terminated_length": 4908.6376953125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9200445115566254,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027611786499619484,
+      "learning_rate": 1e-05,
+      "loss": 0.0575,
+      "num_tokens": 44944356.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3895368278026581,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884366989136,
+      "sampling/importance_sampling_ratio/min": 0.0018651526188477874,
+      "sampling/sampling_logp_difference/max": 6.284412384033203,
+      "sampling/sampling_logp_difference/mean": 0.017853498458862305,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.0136624496226432e-05,
+      "clip_ratio/high_mean": 2.534156124056608e-06,
+      "clip_ratio/low_mean": 2.0260404085092887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2794560095462657e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6290.1796875,
+      "completions/mean_terminated_length": 6129.96044921875,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.9360214695334435,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015557854203507304,
+      "learning_rate": 1e-05,
+      "loss": 0.0111,
+      "num_tokens": 45767867.0,
+      "reward": 0.34375,
+      "reward_std": 0.30168038606643677,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999427795410156,
+      "sampling/importance_sampling_ratio/min": 0.0011004531988874078,
+      "sampling/sampling_logp_difference/max": 6.812033176422119,
+      "sampling/sampling_logp_difference/mean": 0.0200855303555727,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 2.2559511307918e-06,
+      "clip_ratio/high_mean": 5.6398778269795e-07,
+      "clip_ratio/low_mean": 4.51761221711422e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.574010984015331e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16366.0,
+      "completions/mean_length": 6486.15625,
+      "completions/mean_terminated_length": 6248.6083984375,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "entropy": 0.863138921558857,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026953541673719883,
+      "learning_rate": 1e-05,
+      "loss": -0.0194,
+      "num_tokens": 46618575.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999406337738037,
+      "sampling/importance_sampling_ratio/min": 0.0011708897072821856,
+      "sampling/sampling_logp_difference/max": 6.749991416931152,
+      "sampling/sampling_logp_difference/mean": 0.01863238587975502,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.0073357771034352e-05,
+      "clip_ratio/high_mean": 2.518339442758588e-06,
+      "clip_ratio/low_mean": 2.787370635815023e-05,
+      "clip_ratio/low_min": 3.837534222839167e-06,
+      "clip_ratio/region_mean": 3.0392045573535142e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 6442.7734375,
+      "completions/mean_terminated_length": 6284.9765625,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0242054909467697,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024442619178444147,
+      "learning_rate": 1e-05,
+      "loss": 0.0569,
+      "num_tokens": 47462274.0,
+      "reward": 0.328125,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998892545700073,
+      "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09,
+      "sampling/sampling_logp_difference/max": 19.124980926513672,
+      "sampling/sampling_logp_difference/mean": 0.019810764119029045,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 1.220810372615233e-05,
+      "clip_ratio/high_mean": 3.0520259315380827e-06,
+      "clip_ratio/low_mean": 4.339240456374682e-05,
+      "clip_ratio/low_min": 4.491233084991109e-06,
+      "clip_ratio/region_mean": 4.644443038159807e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 4807.765625,
+      "completions/mean_terminated_length": 4716.6142578125,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "entropy": 1.045751042664051,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002512057079002261,
+      "learning_rate": 1e-05,
+      "loss": 0.003,
+      "num_tokens": 48096692.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999058842658997,
+      "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05,
+      "sampling/sampling_logp_difference/max": 11.374892234802246,
+      "sampling/sampling_logp_difference/mean": 0.01960371434688568,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 5.37941218681226e-06,
+      "clip_ratio/high_mean": 1.344853046703065e-06,
+      "clip_ratio/low_mean": 3.0161771633174794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1506624850408116e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 6703.8359375,
+      "completions/mean_terminated_length": 6471.51220703125,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 1.0592866837978363,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016389708034694195,
+      "learning_rate": 1e-05,
+      "loss": -0.024,
+      "num_tokens": 48974399.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2585548758506775,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999353885650635,
+      "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06,
+      "sampling/sampling_logp_difference/max": 11.8125,
+      "sampling/sampling_logp_difference/mean": 0.020880095660686493,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 7.093600515872822e-06,
+      "clip_ratio/high_mean": 1.7734001289682055e-06,
+      "clip_ratio/low_mean": 4.470584758564655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.647924811251869e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16295.0,
+      "completions/mean_length": 6140.5078125,
+      "completions/mean_terminated_length": 5724.10546875,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 1.0998501181602478,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003946912474930286,
+      "learning_rate": 1e-05,
+      "loss": 0.0448,
+      "num_tokens": 49779920.0,
+      "reward": 0.34375,
+      "reward_std": 0.36796674132347107,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 2.849436668839189e-07,
+      "sampling/sampling_logp_difference/max": 15.070974349975586,
+      "sampling/sampling_logp_difference/mean": 0.021355850622057915,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.313956779038563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.313956779038563e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 6689.8046875,
+      "completions/mean_terminated_length": 6213.04052734375,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.8561654165387154,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021656695753335953,
+      "learning_rate": 1e-05,
+      "loss": 0.0283,
+      "num_tokens": 50655023.0,
+      "reward": 0.203125,
+      "reward_std": 0.21723884344100952,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999941885471344,
+      "sampling/importance_sampling_ratio/min": 2.836359499269747e-06,
+      "sampling/sampling_logp_difference/max": 12.772989273071289,
+      "sampling/sampling_logp_difference/mean": 0.01873670145869255,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 2.3421607693308033e-05,
+      "clip_ratio/high_mean": 7.242933975248889e-06,
+      "clip_ratio/low_mean": 3.896083626386826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.620377103492501e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14330.0,
+      "completions/max_terminated_length": 14330.0,
+      "completions/mean_length": 5707.0078125,
+      "completions/mean_terminated_length": 5707.0078125,
+      "completions/min_length": 625.0,
+      "completions/min_terminated_length": 625.0,
+      "entropy": 1.1396166533231735,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004121148493140936,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 51406536.0,
+      "reward": 0.3125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999328851699829,
+      "sampling/importance_sampling_ratio/min": 0.0005196487763896585,
+      "sampling/sampling_logp_difference/max": 7.562357425689697,
+      "sampling/sampling_logp_difference/mean": 0.020000409334897995,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 1.82290532393381e-05,
+      "clip_ratio/high_mean": 4.557263309834525e-06,
+      "clip_ratio/low_mean": 2.5275351731579576e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9832615496161452e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5655.6328125,
+      "completions/mean_terminated_length": 5571.1572265625,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "entropy": 0.8928132206201553,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032538517843931913,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 52148473.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29432642459869385,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000033378601074,
+      "sampling/importance_sampling_ratio/min": 0.0017573959194123745,
+      "sampling/sampling_logp_difference/max": 6.343922138214111,
+      "sampling/sampling_logp_difference/mean": 0.018881790339946747,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.2836022506235167e-05,
+      "clip_ratio/high_mean": 3.209005626558792e-06,
+      "clip_ratio/low_mean": 3.8109637216621195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.131864307055366e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7399.7890625,
+      "completions/mean_terminated_length": 7034.5771484375,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "entropy": 0.8808257132768631,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002061733277514577,
+      "learning_rate": 1e-05,
+      "loss": 0.0191,
+      "num_tokens": 53113230.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673962593079,
+      "sampling/importance_sampling_ratio/min": 0.005283349193632603,
+      "sampling/sampling_logp_difference/max": 5.243195056915283,
+      "sampling/sampling_logp_difference/mean": 0.018456293269991875,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 1.5806871488166507e-05,
+      "clip_ratio/high_mean": 4.739466817227367e-06,
+      "clip_ratio/low_mean": 3.610486896832299e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.084433521711617e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16208.0,
+      "completions/mean_length": 5730.9609375,
+      "completions/mean_terminated_length": 5475.2880859375,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9486126750707626,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012298432411625981,
+      "learning_rate": 1e-05,
+      "loss": 0.0208,
+      "num_tokens": 53864049.0,
+      "reward": 0.359375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999348521232605,
+      "sampling/importance_sampling_ratio/min": 4.832820559386164e-05,
+      "sampling/sampling_logp_difference/max": 9.937495231628418,
+      "sampling/sampling_logp_difference/mean": 0.01919996738433838,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.2390134997986024e-05,
+      "clip_ratio/high_mean": 3.097533749496506e-06,
+      "clip_ratio/low_mean": 3.8867822581778455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.19653564449618e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13500.0,
+      "completions/mean_length": 4620.5703125,
+      "completions/mean_terminated_length": 4527.94482421875,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9557560831308365,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002882040338590741,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 54473498.0,
+      "reward": 0.3984375,
+      "reward_std": 0.39294686913490295,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998915195465088,
+      "sampling/importance_sampling_ratio/min": 1.577107298089686e-07,
+      "sampling/sampling_logp_difference/max": 15.662503242492676,
+      "sampling/sampling_logp_difference/mean": 0.018525000661611557,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.088819471486204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.088819471486204e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16314.0,
+      "completions/max_terminated_length": 16314.0,
+      "completions/mean_length": 5074.0703125,
+      "completions/mean_terminated_length": 5074.0703125,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8830869868397713,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003324020653963089,
+      "learning_rate": 1e-05,
+      "loss": 0.0305,
+      "num_tokens": 55141787.0,
+      "reward": 0.4609375,
+      "reward_std": 0.30115634202957153,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999203681945801,
+      "sampling/importance_sampling_ratio/min": 0.0009876838885247707,
+      "sampling/sampling_logp_difference/max": 6.920147895812988,
+      "sampling/sampling_logp_difference/mean": 0.018072880804538727,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.526649884908693e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.526649884908693e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15251.0,
+      "completions/max_terminated_length": 15251.0,
+      "completions/mean_length": 6192.1015625,
+      "completions/mean_terminated_length": 6192.1015625,
+      "completions/min_length": 553.0,
+      "completions/min_terminated_length": 553.0,
+      "entropy": 1.0888547226786613,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017452294705435634,
+      "learning_rate": 1e-05,
+      "loss": 0.0216,
+      "num_tokens": 55954144.0,
+      "reward": 0.2890625,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473690986633,
+      "sampling/importance_sampling_ratio/min": 5.061922365712235e-07,
+      "sampling/sampling_logp_difference/max": 14.496349334716797,
+      "sampling/sampling_logp_difference/mean": 0.021221645176410675,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.6768677141953958e-05,
+      "clip_ratio/high_mean": 5.080836899651331e-06,
+      "clip_ratio/low_mean": 3.340929970363504e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.84901372854074e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6204.296875,
+      "completions/mean_terminated_length": 6124.1416015625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 1.0423575639724731,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0033357341308146715,
+      "learning_rate": 1e-05,
+      "loss": 0.1073,
+      "num_tokens": 56765470.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37875816226005554,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99998539686203,
+      "sampling/importance_sampling_ratio/min": 4.564182381727733e-05,
+      "sampling/sampling_logp_difference/max": 9.994686126708984,
+      "sampling/sampling_logp_difference/mean": 0.01908688060939312,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 3.149884150843718e-06,
+      "clip_ratio/high_mean": 7.874710377109295e-07,
+      "clip_ratio/low_mean": 2.430614893000893e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.509361991087644e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14409.0,
+      "completions/max_terminated_length": 14409.0,
+      "completions/mean_length": 5070.3125,
+      "completions/mean_terminated_length": 5070.3125,
+      "completions/min_length": 629.0,
+      "completions/min_terminated_length": 629.0,
+      "entropy": 1.0737399458885193,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038695367984473705,
+      "learning_rate": 1e-05,
+      "loss": 0.0015,
+      "num_tokens": 57432958.0,
+      "reward": 0.390625,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223947525024,
+      "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06,
+      "sampling/sampling_logp_difference/max": 13.376652717590332,
+      "sampling/sampling_logp_difference/mean": 0.01970684342086315,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 1.9821940441033803e-05,
+      "clip_ratio/high_mean": 4.955485110258451e-06,
+      "clip_ratio/low_mean": 2.9055729555693688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.401121466595214e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15799.0,
+      "completions/mean_length": 5750.21875,
+      "completions/mean_terminated_length": 5495.00830078125,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "entropy": 0.9708107560873032,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002927646040916443,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 58187426.0,
+      "reward": 0.296875,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999390840530396,
+      "sampling/importance_sampling_ratio/min": 0.015204614959657192,
+      "sampling/sampling_logp_difference/max": 4.186156272888184,
+      "sampling/sampling_logp_difference/mean": 0.019483914598822594,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.3815636723156786e-05,
+      "clip_ratio/high_mean": 5.953909180789196e-06,
+      "clip_ratio/low_mean": 4.989707144886779e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.585097960647545e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15938.0,
+      "completions/mean_length": 6067.484375,
+      "completions/mean_terminated_length": 5986.251953125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9576351121068001,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0026169484481215477,
+      "learning_rate": 1e-05,
+      "loss": -0.0055,
+      "num_tokens": 58983336.0,
+      "reward": 0.390625,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999620914459229,
+      "sampling/importance_sampling_ratio/min": 1.974713995878119e-06,
+      "sampling/sampling_logp_difference/max": 13.135087013244629,
+      "sampling/sampling_logp_difference/mean": 0.019007554277777672,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 2.4238934656750644e-05,
+      "clip_ratio/high_mean": 7.786730066072778e-06,
+      "clip_ratio/low_mean": 4.5700241571466904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3486972547034384e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13640.0,
+      "completions/max_terminated_length": 13640.0,
+      "completions/mean_length": 4612.8984375,
+      "completions/mean_terminated_length": 4612.8984375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "entropy": 0.9636320173740387,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015429699560627341,
+      "learning_rate": 1e-05,
+      "loss": -0.018,
+      "num_tokens": 59590763.0,
+      "reward": 0.421875,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08,
+      "sampling/sampling_logp_difference/max": 17.468652725219727,
+      "sampling/sampling_logp_difference/mean": 0.019313856959342957,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0911465842109465e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0911465842109465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 6101.3125,
+      "completions/mean_terminated_length": 5854.5283203125,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "entropy": 0.8831139355897903,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022505265660583973,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 60391283.0,
+      "reward": 0.3125,
+      "reward_std": 0.29302334785461426,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 0.0003816343960352242,
+      "sampling/sampling_logp_difference/max": 7.871047496795654,
+      "sampling/sampling_logp_difference/mean": 0.018377842381596565,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 1.547606643725885e-05,
+      "clip_ratio/high_mean": 3.869016609314713e-06,
+      "clip_ratio/low_mean": 2.478705800967873e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8656074391619768e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14862.0,
+      "completions/mean_length": 4705.9921875,
+      "completions/mean_terminated_length": 4614.03955078125,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "entropy": 0.9557913094758987,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002069958718493581,
+      "learning_rate": 1e-05,
+      "loss": -0.0015,
+      "num_tokens": 61021490.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2637920379638672,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030232429504,
+      "sampling/importance_sampling_ratio/min": 2.76673017651774e-05,
+      "sampling/sampling_logp_difference/max": 10.495259284973145,
+      "sampling/sampling_logp_difference/mean": 0.018629569560289383,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 2.0910484636260662e-05,
+      "clip_ratio/high_mean": 5.2276211590651656e-06,
+      "clip_ratio/low_mean": 1.952954164607945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4757162805144617e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13745.0,
+      "completions/max_terminated_length": 13745.0,
+      "completions/mean_length": 5116.78125,
+      "completions/mean_terminated_length": 5116.78125,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "entropy": 1.0198405236005783,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034461067989468575,
+      "learning_rate": 1e-05,
+      "loss": -0.0073,
+      "num_tokens": 61695382.0,
+      "reward": 0.265625,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999936819076538,
+      "sampling/importance_sampling_ratio/min": 0.012227212078869343,
+      "sampling/sampling_logp_difference/max": 4.4040913581848145,
+      "sampling/sampling_logp_difference/mean": 0.019400250166654587,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.5340228401328204e-05,
+      "clip_ratio/high_mean": 3.835057100332051e-06,
+      "clip_ratio/low_mean": 3.150914017169271e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.534419727202476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 5891.9140625,
+      "completions/mean_terminated_length": 5553.45947265625,
+      "completions/min_length": 79.0,
+      "completions/min_terminated_length": 79.0,
+      "entropy": 0.9568078517913818,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025854657869786024,
+      "learning_rate": 1e-05,
+      "loss": 0.1013,
+      "num_tokens": 62474883.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001013278961182,
+      "sampling/importance_sampling_ratio/min": 0.0015072470996528864,
+      "sampling/sampling_logp_difference/max": 6.497470378875732,
+      "sampling/sampling_logp_difference/mean": 0.019574139267206192,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 1.108303422370227e-05,
+      "clip_ratio/high_mean": 2.7707585559255676e-06,
+      "clip_ratio/low_mean": 2.2325777763398946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5096536319324514e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13671.0,
+      "completions/mean_length": 5300.3359375,
+      "completions/mean_terminated_length": 5213.06298828125,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "entropy": 0.9722280204296112,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025075653102248907,
+      "learning_rate": 1e-05,
+      "loss": 0.0312,
+      "num_tokens": 63172454.0,
+      "reward": 0.203125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 0.00020346972451079637,
+      "sampling/sampling_logp_difference/max": 8.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02002432942390442,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 1.3991947980684927e-05,
+      "clip_ratio/high_mean": 3.4979869951712317e-06,
+      "clip_ratio/low_mean": 4.893367201930232e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.243165958290774e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15617.0,
+      "completions/mean_length": 6364.21875,
+      "completions/mean_terminated_length": 6205.1748046875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "entropy": 1.0607495978474617,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017982006538659334,
+      "learning_rate": 1e-05,
+      "loss": -0.0117,
+      "num_tokens": 64007602.0,
+      "reward": 0.2890625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 3.823801307589747e-05,
+      "sampling/sampling_logp_difference/max": 10.171680450439453,
+      "sampling/sampling_logp_difference/mean": 0.020373597741127014,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6416430046083406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6416430046083406e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14709.0,
+      "completions/mean_length": 5746.3125,
+      "completions/mean_terminated_length": 5403.1611328125,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "entropy": 0.9913106113672256,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002207317156717181,
+      "learning_rate": 1e-05,
+      "loss": 0.063,
+      "num_tokens": 64762058.0,
+      "reward": 0.34375,
+      "reward_std": 0.3264310359954834,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999239444732666,
+      "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08,
+      "sampling/sampling_logp_difference/max": 16.744617462158203,
+      "sampling/sampling_logp_difference/mean": 0.020608089864253998,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 1.2681661701208213e-05,
+      "clip_ratio/high_mean": 3.1704154253020533e-06,
+      "clip_ratio/low_mean": 3.541917828897567e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.85895939416514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6088.5625,
+      "completions/mean_terminated_length": 5841.47216796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.9040444120764732,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012974507408216596,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 65561002.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998487234115601,
+      "sampling/importance_sampling_ratio/min": 6.021501121722395e-06,
+      "sampling/sampling_logp_difference/max": 12.020174026489258,
+      "sampling/sampling_logp_difference/mean": 0.01939838007092476,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 7.807132533343975e-06,
+      "clip_ratio/high_mean": 1.9517831333359936e-06,
+      "clip_ratio/low_mean": 1.8564539345788944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.05163223654381e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15021.0,
+      "completions/mean_length": 5765.5,
+      "completions/mean_terminated_length": 5510.65625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 0.9966336265206337,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0013380619930103421,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 66318482.0,
+      "reward": 0.375,
+      "reward_std": 0.13994136452674866,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999471306800842,
+      "sampling/importance_sampling_ratio/min": 7.288413598871557e-06,
+      "sampling/sampling_logp_difference/max": 11.829224586486816,
+      "sampling/sampling_logp_difference/mean": 0.018109245225787163,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 1.7906912489706883e-05,
+      "clip_ratio/high_mean": 4.476728122426721e-06,
+      "clip_ratio/low_mean": 2.5812531305291486e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0289259655091882e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16120.0,
+      "completions/mean_length": 5462.78125,
+      "completions/mean_terminated_length": 5200.67236328125,
+      "completions/min_length": 460.0,
+      "completions/min_terminated_length": 460.0,
+      "entropy": 0.9345141425728798,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023930128663778305,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 67038582.0,
+      "reward": 0.46875,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513030052185,
+      "sampling/importance_sampling_ratio/min": 0.008508839644491673,
+      "sampling/sampling_logp_difference/max": 4.7666497230529785,
+      "sampling/sampling_logp_difference/mean": 0.019220296293497086,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.551389118503721e-05,
+      "clip_ratio/high_mean": 3.878472796259302e-06,
+      "clip_ratio/low_mean": 3.239646628117043e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6274939645863924e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15034.0,
+      "completions/max_terminated_length": 15034.0,
+      "completions/mean_length": 5547.5078125,
+      "completions/mean_terminated_length": 5547.5078125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0511749312281609,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0013633714988827705,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 67774487.0,
+      "reward": 0.203125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05,
+      "sampling/sampling_logp_difference/max": 11.418023109436035,
+      "sampling/sampling_logp_difference/mean": 0.020328814163804054,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.5384989410449634e-05,
+      "clip_ratio/high_mean": 3.846247352612409e-06,
+      "clip_ratio/low_mean": 3.441604167164769e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.826228908110352e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5835.4140625,
+      "completions/mean_terminated_length": 5406.609375,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 1.0024723336100578,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0036165034398436546,
+      "learning_rate": 1e-05,
+      "loss": 0.0373,
+      "num_tokens": 68541660.0,
+      "reward": 0.34375,
+      "reward_std": 0.3584783673286438,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 9.518130354990717e-06,
+      "sampling/sampling_logp_difference/max": 11.562312126159668,
+      "sampling/sampling_logp_difference/mean": 0.020469525828957558,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 6.105602551542688e-06,
+      "clip_ratio/high_mean": 1.526400637885672e-06,
+      "clip_ratio/low_mean": 5.3129634352444555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.46560352177039e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15695.0,
+      "completions/mean_length": 6252.609375,
+      "completions/mean_terminated_length": 6172.83447265625,
+      "completions/min_length": 481.0,
+      "completions/min_terminated_length": 481.0,
+      "entropy": 1.0325519517064095,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022011541295796633,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 69365418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.32301604747772217,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998809099197388,
+      "sampling/importance_sampling_ratio/min": 0.0005531083443202078,
+      "sampling/sampling_logp_difference/max": 7.4999566078186035,
+      "sampling/sampling_logp_difference/mean": 0.02079072594642639,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 4.348128641140647e-06,
+      "clip_ratio/high_mean": 1.0870321602851618e-06,
+      "clip_ratio/low_mean": 3.0097819148977578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.118485085451539e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15316.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 5581.484375,
+      "completions/mean_terminated_length": 5581.484375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.9222500994801521,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002300912281498313,
+      "learning_rate": 1e-05,
+      "loss": -0.0007,
+      "num_tokens": 70099320.0,
+      "reward": 0.296875,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998577833175659,
+      "sampling/importance_sampling_ratio/min": 8.140386853483506e-08,
+      "sampling/sampling_logp_difference/max": 16.323843002319336,
+      "sampling/sampling_logp_difference/mean": 0.01952272653579712,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.5122252029395895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5122252029395895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15781.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5424.140625,
+      "completions/mean_terminated_length": 5424.140625,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "entropy": 1.0446564108133316,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016312639927491546,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 70811474.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000094175338745,
+      "sampling/importance_sampling_ratio/min": 0.0021919538266956806,
+      "sampling/sampling_logp_difference/max": 6.12296199798584,
+      "sampling/sampling_logp_difference/mean": 0.019741754978895187,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.0354576261306647e-05,
+      "clip_ratio/high_mean": 3.496124691082514e-06,
+      "clip_ratio/low_mean": 4.096481598026003e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.446094089871622e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 5884.9609375,
+      "completions/mean_terminated_length": 5884.9609375,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9605691060423851,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032865386456251144,
+      "learning_rate": 1e-05,
+      "loss": 0.0451,
+      "num_tokens": 71582701.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3514111638069153,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999833106994629,
+      "sampling/importance_sampling_ratio/min": 1.149311810877407e-05,
+      "sampling/sampling_logp_difference/max": 11.373762130737305,
+      "sampling/sampling_logp_difference/mean": 0.019438734278082848,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 1.026998006636859e-05,
+      "clip_ratio/high_mean": 2.5674950165921473e-06,
+      "clip_ratio/low_mean": 3.5440503552308655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8007998455213965e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15361.0,
+      "completions/max_terminated_length": 15361.0,
+      "completions/mean_length": 4835.09375,
+      "completions/mean_terminated_length": 4835.09375,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.9038172215223312,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004721678793430328,
+      "learning_rate": 1e-05,
+      "loss": 0.1143,
+      "num_tokens": 72220025.0,
+      "reward": 0.4765625,
+      "reward_std": 0.38481879234313965,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99994957447052,
+      "sampling/importance_sampling_ratio/min": 2.710051205667696e-07,
+      "sampling/sampling_logp_difference/max": 15.12112808227539,
+      "sampling/sampling_logp_difference/mean": 0.017888439819216728,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 2.93432283342554e-05,
+      "clip_ratio/high_mean": 9.56252398509605e-06,
+      "clip_ratio/low_mean": 4.7865792453194445e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.742831808674964e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14431.0,
+      "completions/mean_length": 5979.078125,
+      "completions/mean_terminated_length": 5897.1494140625,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "entropy": 1.0227951630949974,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0010532280430197716,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 73005515.0,
+      "reward": 0.2890625,
+      "reward_std": 0.30115631222724915,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999090433120728,
+      "sampling/importance_sampling_ratio/min": 0.00030157779110595584,
+      "sampling/sampling_logp_difference/max": 8.10648250579834,
+      "sampling/sampling_logp_difference/mean": 0.019633149728178978,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 4.203234766464448e-06,
+      "clip_ratio/high_mean": 1.050808691616112e-06,
+      "clip_ratio/low_mean": 2.5574990331733716e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6625799137036665e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15886.0,
+      "completions/max_terminated_length": 15886.0,
+      "completions/mean_length": 4292.1796875,
+      "completions/mean_terminated_length": 4292.1796875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 0.8719984591007233,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038324075285345316,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 73572794.0,
+      "reward": 0.4375,
+      "reward_std": 0.2972046136856079,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999188780784607,
+      "sampling/importance_sampling_ratio/min": 0.015675775706768036,
+      "sampling/sampling_logp_difference/max": 4.155638694763184,
+      "sampling/sampling_logp_difference/mean": 0.018074234947562218,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 4.431366960488958e-06,
+      "clip_ratio/high_mean": 1.1078417401222396e-06,
+      "clip_ratio/low_mean": 4.433405501913512e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.54418968729442e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14674.0,
+      "completions/max_terminated_length": 14674.0,
+      "completions/mean_length": 5449.2890625,
+      "completions/mean_terminated_length": 5449.2890625,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9137986451387405,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004843447357416153,
+      "learning_rate": 1e-05,
+      "loss": 0.0166,
+      "num_tokens": 74289607.0,
+      "reward": 0.5,
+      "reward_std": 0.40609243512153625,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 8.851584993863071e-07,
+      "sampling/sampling_logp_difference/max": 13.937499046325684,
+      "sampling/sampling_logp_difference/mean": 0.018183842301368713,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 8.212076863856055e-06,
+      "clip_ratio/high_mean": 2.0530192159640137e-06,
+      "clip_ratio/low_mean": 3.6279372466196946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.833239122741361e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16163.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 4983.3515625,
+      "completions/mean_terminated_length": 4983.3515625,
+      "completions/min_length": 541.0,
+      "completions/min_terminated_length": 541.0,
+      "entropy": 0.9354705810546875,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037651765160262585,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 74946484.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519309043884,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549984931946,
+      "sampling/importance_sampling_ratio/min": 0.00011593531962716952,
+      "sampling/sampling_logp_difference/max": 9.062478065490723,
+      "sampling/sampling_logp_difference/mean": 0.018207306042313576,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.3182888324081432e-05,
+      "clip_ratio/high_mean": 3.295722081020358e-06,
+      "clip_ratio/low_mean": 2.544108633628639e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8736808644680423e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16039.0,
+      "completions/mean_length": 6351.1015625,
+      "completions/mean_terminated_length": 6027.45947265625,
+      "completions/min_length": 804.0,
+      "completions/min_terminated_length": 804.0,
+      "entropy": 0.9310042560100555,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0009160125628113747,
+      "learning_rate": 1e-05,
+      "loss": -0.023,
+      "num_tokens": 75779145.0,
+      "reward": 0.3828125,
+      "reward_std": 0.24329257011413574,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998877048492432,
+      "sampling/importance_sampling_ratio/min": 0.0002961359277833253,
+      "sampling/sampling_logp_difference/max": 8.1246919631958,
+      "sampling/sampling_logp_difference/mean": 0.018513178452849388,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.1402620202716207e-05,
+      "clip_ratio/high_mean": 3.935649147024378e-06,
+      "clip_ratio/low_mean": 3.059757568735222e-05,
+      "clip_ratio/low_min": 4.3258582991256844e-06,
+      "clip_ratio/region_mean": 3.45332257438713e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14471.0,
+      "completions/mean_length": 5293.40625,
+      "completions/mean_terminated_length": 4935.64501953125,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "entropy": 1.0732879787683487,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023993055801838636,
+      "learning_rate": 1e-05,
+      "loss": 0.1021,
+      "num_tokens": 76475557.0,
+      "reward": 0.34375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000077724456787,
+      "sampling/importance_sampling_ratio/min": 6.613240111619234e-05,
+      "sampling/sampling_logp_difference/max": 9.623851776123047,
+      "sampling/sampling_logp_difference/mean": 0.020792219787836075,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 2.130644793396641e-05,
+      "clip_ratio/high_mean": 8.929533635182452e-06,
+      "clip_ratio/low_mean": 2.663600798769039e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.556554071337814e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 7619.7578125,
+      "completions/mean_terminated_length": 7409.41650390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9646238535642624,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014872358879074454,
+      "learning_rate": 1e-05,
+      "loss": 0.0439,
+      "num_tokens": 77474310.0,
+      "reward": 0.34375,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999638795852661,
+      "sampling/importance_sampling_ratio/min": 0.0016686831368133426,
+      "sampling/sampling_logp_difference/max": 6.395720481872559,
+      "sampling/sampling_logp_difference/mean": 0.020074717700481415,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 1.7765815300663235e-05,
+      "clip_ratio/high_mean": 5.154013138053415e-06,
+      "clip_ratio/low_mean": 5.166909659237717e-05,
+      "clip_ratio/low_min": 8.365680514543783e-06,
+      "clip_ratio/region_mean": 5.68231100714911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15984.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 5959.921875,
+      "completions/mean_terminated_length": 5959.921875,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.004471093416214,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00398358516395092,
+      "learning_rate": 1e-05,
+      "loss": 0.1016,
+      "num_tokens": 78257132.0,
+      "reward": 0.359375,
+      "reward_std": 0.3653082847595215,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000170469284058,
+      "sampling/importance_sampling_ratio/min": 0.0030075267422944307,
+      "sampling/sampling_logp_difference/max": 5.806637287139893,
+      "sampling/sampling_logp_difference/mean": 0.020755283534526825,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6946955838648137e-05,
+      "clip_ratio/high_mean": 4.236738959662034e-06,
+      "clip_ratio/low_mean": 4.510891039899434e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.934564867653535e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13736.0,
+      "completions/mean_length": 5427.03125,
+      "completions/mean_terminated_length": 5340.755859375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9117375314235687,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0019883522763848305,
+      "learning_rate": 1e-05,
+      "loss": 0.01,
+      "num_tokens": 78971072.0,
+      "reward": 0.375,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000550746917725,
+      "sampling/importance_sampling_ratio/min": 0.0008046010043472052,
+      "sampling/sampling_logp_difference/max": 7.125164031982422,
+      "sampling/sampling_logp_difference/mean": 0.018812140449881554,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 2.968176841022796e-05,
+      "clip_ratio/high_mean": 7.42044210255699e-06,
+      "clip_ratio/low_mean": 3.220799408154562e-05,
+      "clip_ratio/low_min": 5.315981979947537e-06,
+      "clip_ratio/region_mean": 3.962843629778945e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16293.0,
+      "completions/max_terminated_length": 16293.0,
+      "completions/mean_length": 6062.078125,
+      "completions/mean_terminated_length": 6062.078125,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 1.0164100378751755,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00450351694598794,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 79764434.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26355957984924316,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999713897705078,
+      "sampling/importance_sampling_ratio/min": 0.0007411236292682588,
+      "sampling/sampling_logp_difference/max": 7.207343101501465,
+      "sampling/sampling_logp_difference/mean": 0.020526543259620667,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.856050622947805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.856050622947805e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13689.0,
+      "completions/max_terminated_length": 13689.0,
+      "completions/mean_length": 4856.53125,
+      "completions/mean_terminated_length": 4856.53125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "entropy": 1.0780886858701706,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0033157530706375837,
+      "learning_rate": 1e-05,
+      "loss": 0.046,
+      "num_tokens": 80405238.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3487703502178192,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999889135360718,
+      "sampling/importance_sampling_ratio/min": 0.033773623406887054,
+      "sampling/sampling_logp_difference/max": 3.7256407737731934,
+      "sampling/sampling_logp_difference/mean": 0.019188418984413147,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.975351790406421e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.975351790406421e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16335.0,
+      "completions/max_terminated_length": 16335.0,
+      "completions/mean_length": 3930.5859375,
+      "completions/mean_terminated_length": 3930.5859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8666863515973091,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005471619311720133,
+      "learning_rate": 1e-05,
+      "loss": -0.0779,
+      "num_tokens": 80926721.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3164186179637909,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000040531158447,
+      "sampling/importance_sampling_ratio/min": 0.0002562212466727942,
+      "sampling/sampling_logp_difference/max": 8.269469261169434,
+      "sampling/sampling_logp_difference/mean": 0.017708823084831238,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 6.743997801095247e-06,
+      "clip_ratio/high_mean": 1.6859994502738118e-06,
+      "clip_ratio/low_mean": 3.61007656692891e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7786765119562915e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15546.0,
+      "completions/mean_length": 5934.9453125,
+      "completions/mean_terminated_length": 5684.16845703125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 0.9991667941212654,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002580739092081785,
+      "learning_rate": 1e-05,
+      "loss": -0.0065,
+      "num_tokens": 81707978.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24671243131160736,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000852346420288,
+      "sampling/importance_sampling_ratio/min": 0.002478762762621045,
+      "sampling/sampling_logp_difference/max": 5.999995708465576,
+      "sampling/sampling_logp_difference/mean": 0.019801246002316475,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.43532002741631e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.43532002741631e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 5866.84375,
+      "completions/mean_terminated_length": 5699.9052734375,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "entropy": 0.9848997294902802,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0010949905263260007,
+      "learning_rate": 1e-05,
+      "loss": 0.0266,
+      "num_tokens": 82477310.0,
+      "reward": 0.2734375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999667406082153,
+      "sampling/importance_sampling_ratio/min": 9.04304688447155e-05,
+      "sampling/sampling_logp_difference/max": 9.310929298400879,
+      "sampling/sampling_logp_difference/mean": 0.020769795402884483,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 1.9307613456476247e-05,
+      "clip_ratio/high_mean": 4.826903364119062e-06,
+      "clip_ratio/low_mean": 5.842190330440644e-05,
+      "clip_ratio/low_min": 1.2287753634154797e-05,
+      "clip_ratio/region_mean": 6.324880496322294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14501.0,
+      "completions/max_terminated_length": 14501.0,
+      "completions/mean_length": 6613.7578125,
+      "completions/mean_terminated_length": 6613.7578125,
+      "completions/min_length": 1033.0,
+      "completions/min_terminated_length": 1033.0,
+      "entropy": 0.9176012054085732,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020384234376251698,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 83345055.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999457001686096,
+      "sampling/importance_sampling_ratio/min": 0.029541675001382828,
+      "sampling/sampling_logp_difference/max": 3.5219533443450928,
+      "sampling/sampling_logp_difference/mean": 0.018883168697357178,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.382043183184578e-05,
+      "clip_ratio/high_mean": 3.455107957961445e-06,
+      "clip_ratio/low_mean": 5.789885449303256e-05,
+      "clip_ratio/low_min": 1.017130716718384e-05,
+      "clip_ratio/region_mean": 6.135396188255982e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 6392.3125,
+      "completions/mean_terminated_length": 6070.0,
+      "completions/min_length": 507.0,
+      "completions/min_terminated_length": 507.0,
+      "entropy": 0.904954232275486,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0031166900880634785,
+      "learning_rate": 1e-05,
+      "loss": 0.0351,
+      "num_tokens": 84186343.0,
+      "reward": 0.390625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999208450317383,
+      "sampling/importance_sampling_ratio/min": 0.00022529886336997151,
+      "sampling/sampling_logp_difference/max": 8.398082733154297,
+      "sampling/sampling_logp_difference/mean": 0.01931958645582199,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.7221671441802755e-05,
+      "clip_ratio/high_mean": 6.549099907715572e-06,
+      "clip_ratio/low_mean": 3.147818074467068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802728065238625e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5982.703125,
+      "completions/mean_terminated_length": 5817.603515625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 0.8394555225968361,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022041688207536936,
+      "learning_rate": 1e-05,
+      "loss": 0.1043,
+      "num_tokens": 84971129.0,
+      "reward": 0.3125,
+      "reward_std": 0.30774885416030884,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999030828475952,
+      "sampling/importance_sampling_ratio/min": 1.553593506287143e-06,
+      "sampling/sampling_logp_difference/max": 13.374939918518066,
+      "sampling/sampling_logp_difference/mean": 0.01795877143740654,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 2.9651660042873118e-05,
+      "clip_ratio/high_mean": 9.398806923854863e-06,
+      "clip_ratio/low_mean": 4.788733849636628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.728614519284747e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14988.0,
+      "completions/mean_length": 4976.921875,
+      "completions/mean_terminated_length": 4608.95166015625,
+      "completions/min_length": 335.0,
+      "completions/min_terminated_length": 335.0,
+      "entropy": 0.8381234556436539,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0037972736172378063,
+      "learning_rate": 1e-05,
+      "loss": 0.1244,
+      "num_tokens": 85625559.0,
+      "reward": 0.4765625,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970555305481,
+      "sampling/importance_sampling_ratio/min": 0.002990707289427519,
+      "sampling/sampling_logp_difference/max": 5.8122453689575195,
+      "sampling/sampling_logp_difference/mean": 0.01815030723810196,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 4.130592969886493e-06,
+      "clip_ratio/high_mean": 1.0326482424716232e-06,
+      "clip_ratio/low_mean": 1.6904315600640984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7936963843112608e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 6307.2421875,
+      "completions/mean_terminated_length": 6065.400390625,
+      "completions/min_length": 823.0,
+      "completions/min_terminated_length": 823.0,
+      "entropy": 1.1176434755325317,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0012413962977007031,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 86453606.0,
+      "reward": 0.28125,
+      "reward_std": 0.2280253767967224,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 0.004730688873678446,
+      "sampling/sampling_logp_difference/max": 5.353684425354004,
+      "sampling/sampling_logp_difference/mean": 0.021790307015180588,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.3160772823539446e-05,
+      "clip_ratio/high_mean": 3.2901932058848615e-06,
+      "clip_ratio/low_mean": 3.582628983167524e-05,
+      "clip_ratio/low_min": 2.61966624748311e-06,
+      "clip_ratio/region_mean": 3.911648195753514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 7263.1640625,
+      "completions/mean_terminated_length": 7044.26416015625,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.107876107096672,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017762042116373777,
+      "learning_rate": 1e-05,
+      "loss": 0.0349,
+      "num_tokens": 87402763.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741315841675,
+      "sampling/importance_sampling_ratio/min": 0.0009408573969267309,
+      "sampling/sampling_logp_difference/max": 6.968719005584717,
+      "sampling/sampling_logp_difference/mean": 0.02103034406900406,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.987745776612428e-05,
+      "clip_ratio/high_mean": 1.1877163728968299e-05,
+      "clip_ratio/low_mean": 4.26799579145154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.455712096136267e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15416.0,
+      "completions/mean_length": 5093.859375,
+      "completions/mean_terminated_length": 4914.65087890625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 1.1065888702869415,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032127038575708866,
+      "learning_rate": 1e-05,
+      "loss": 0.0194,
+      "num_tokens": 88077385.0,
+      "reward": 0.421875,
+      "reward_std": 0.345874547958374,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 7.033879228401929e-05,
+      "sampling/sampling_logp_difference/max": 9.562187194824219,
+      "sampling/sampling_logp_difference/mean": 0.020314980298280716,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 9.35208754526684e-06,
+      "clip_ratio/high_mean": 4.4788730519940145e-06,
+      "clip_ratio/low_mean": 3.470697703278347e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.918584917528278e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6943.53125,
+      "completions/mean_terminated_length": 6639.0,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "entropy": 0.9009081721305847,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028925195802003145,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "num_tokens": 88985269.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3535328209400177,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980628490448,
+      "sampling/importance_sampling_ratio/min": 6.553035092338177e-08,
+      "sampling/sampling_logp_difference/max": 16.540752410888672,
+      "sampling/sampling_logp_difference/mean": 0.019378282129764557,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 1.0939961612166371e-05,
+      "clip_ratio/high_mean": 2.734990403041593e-06,
+      "clip_ratio/low_mean": 2.4615862798782473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7350853201824066e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15148.0,
+      "completions/max_terminated_length": 15148.0,
+      "completions/mean_length": 4976.25,
+      "completions/mean_terminated_length": 4976.25,
+      "completions/min_length": 702.0,
+      "completions/min_terminated_length": 702.0,
+      "entropy": 0.9463540017604828,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017386430408805609,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 89645205.0,
+      "reward": 0.359375,
+      "reward_std": 0.26462042331695557,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999554753303528,
+      "sampling/importance_sampling_ratio/min": 7.889595508459024e-06,
+      "sampling/sampling_logp_difference/max": 11.74996566772461,
+      "sampling/sampling_logp_difference/mean": 0.018035830929875374,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 5.941629297012696e-06,
+      "clip_ratio/high_mean": 1.485407324253174e-06,
+      "clip_ratio/low_mean": 2.6826061798601586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8311469009167922e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 6439.5390625,
+      "completions/mean_terminated_length": 6281.69091796875,
+      "completions/min_length": 959.0,
+      "completions/min_terminated_length": 959.0,
+      "entropy": 0.899876207113266,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037381781730800867,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 90489394.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2624938488006592,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999206066131592,
+      "sampling/importance_sampling_ratio/min": 0.003606764366850257,
+      "sampling/sampling_logp_difference/max": 5.62494421005249,
+      "sampling/sampling_logp_difference/mean": 0.019368179142475128,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 5.189952389628161e-06,
+      "clip_ratio/high_mean": 1.2974880974070402e-06,
+      "clip_ratio/low_mean": 3.058137212974543e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.187886022715247e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15979.0,
+      "completions/mean_length": 6876.46875,
+      "completions/mean_terminated_length": 6408.884765625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.1018569767475128,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018562980694696307,
+      "learning_rate": 1e-05,
+      "loss": 0.095,
+      "num_tokens": 91390054.0,
+      "reward": 0.21875,
+      "reward_std": 0.29955869913101196,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999849796295166,
+      "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05,
+      "sampling/sampling_logp_difference/max": 10.436432838439941,
+      "sampling/sampling_logp_difference/mean": 0.020825792104005814,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.022083435804234e-05,
+      "clip_ratio/high_mean": 5.055208589510585e-06,
+      "clip_ratio/low_mean": 3.029032552603894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.53455343429232e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14153.0,
+      "completions/mean_length": 6501.5078125,
+      "completions/mean_terminated_length": 6344.64306640625,
+      "completions/min_length": 720.0,
+      "completions/min_terminated_length": 720.0,
+      "entropy": 1.073579266667366,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016695430967956781,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 92241535.0,
+      "reward": 0.2734375,
+      "reward_std": 0.28641316294670105,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998984336853027,
+      "sampling/importance_sampling_ratio/min": 0.0002380236255703494,
+      "sampling/sampling_logp_difference/max": 8.343140602111816,
+      "sampling/sampling_logp_difference/mean": 0.020438479259610176,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 3.3911180707946187e-06,
+      "clip_ratio/high_mean": 8.477795176986547e-07,
+      "clip_ratio/low_mean": 2.2190370486896427e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.30381500614385e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14345.0,
+      "completions/max_terminated_length": 14345.0,
+      "completions/mean_length": 5474.1328125,
+      "completions/mean_terminated_length": 5474.1328125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0692576617002487,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034909825772047043,
+      "learning_rate": 1e-05,
+      "loss": 0.0,
+      "num_tokens": 92962472.0,
+      "reward": 0.3046875,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000006079673767,
+      "sampling/importance_sampling_ratio/min": 0.0017851731972768903,
+      "sampling/sampling_logp_difference/max": 6.328239917755127,
+      "sampling/sampling_logp_difference/mean": 0.019930578768253326,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 2.6292200345778838e-05,
+      "clip_ratio/high_mean": 7.620442374900449e-06,
+      "clip_ratio/low_mean": 4.615546390596137e-05,
+      "clip_ratio/low_min": 1.366510537081922e-05,
+      "clip_ratio/region_mean": 5.3775906508235494e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7512.078125,
+      "completions/mean_terminated_length": 7225.88671875,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9676955863833427,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023449272848665714,
+      "learning_rate": 1e-05,
+      "loss": 0.0454,
+      "num_tokens": 93950506.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999359250068665,
+      "sampling/importance_sampling_ratio/min": 0.0016406332142651081,
+      "sampling/sampling_logp_difference/max": 6.412672996520996,
+      "sampling/sampling_logp_difference/mean": 0.020141655579209328,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 5.097255780128762e-06,
+      "clip_ratio/high_mean": 1.2743139450321905e-06,
+      "clip_ratio/low_mean": 3.3802551342887455e-05,
+      "clip_ratio/low_min": 4.146762421441963e-06,
+      "clip_ratio/region_mean": 3.5076865287919645e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6920.484375,
+      "completions/mean_terminated_length": 6693.3603515625,
+      "completions/min_length": 962.0,
+      "completions/min_terminated_length": 962.0,
+      "entropy": 0.8662540689110756,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037103090435266495,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 94854016.0,
+      "reward": 0.4375,
+      "reward_std": 0.322716623544693,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00047686786274425685,
+      "sampling/sampling_logp_difference/max": 7.648271083831787,
+      "sampling/sampling_logp_difference/mean": 0.01915796287357807,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 8.4922439782531e-06,
+      "clip_ratio/high_mean": 2.123060994563275e-06,
+      "clip_ratio/low_mean": 5.024227584726759e-05,
+      "clip_ratio/low_min": 1.3627016414829995e-05,
+      "clip_ratio/region_mean": 5.236533706920454e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 7939.609375,
+      "completions/mean_terminated_length": 7805.57177734375,
+      "completions/min_length": 1260.0,
+      "completions/min_terminated_length": 1260.0,
+      "entropy": 0.9707008600234985,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024642283096909523,
+      "learning_rate": 1e-05,
+      "loss": 0.0788,
+      "num_tokens": 95889966.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998771548271179,
+      "sampling/importance_sampling_ratio/min": 4.540014560916461e-05,
+      "sampling/sampling_logp_difference/max": 9.999995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020453302189707756,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.766829564710861e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.766829564710861e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14969.0,
+      "completions/mean_length": 5985.8203125,
+      "completions/mean_terminated_length": 5474.43408203125,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 0.9083090648055077,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003317479742690921,
+      "learning_rate": 1e-05,
+      "loss": 0.0537,
+      "num_tokens": 96676847.0,
+      "reward": 0.3671875,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.000286750087980181,
+      "sampling/sampling_logp_difference/max": 8.156899452209473,
+      "sampling/sampling_logp_difference/mean": 0.01996719278395176,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.8439853647578275e-05,
+      "clip_ratio/high_mean": 4.609963411894569e-06,
+      "clip_ratio/low_mean": 5.708034223061986e-05,
+      "clip_ratio/low_min": 2.75287948170444e-06,
+      "clip_ratio/region_mean": 6.169030598357494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15081.0,
+      "completions/mean_length": 6565.359375,
+      "completions/mean_terminated_length": 6488.04736328125,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 1.1013468354940414,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019073591101914644,
+      "learning_rate": 1e-05,
+      "loss": 0.0622,
+      "num_tokens": 97539453.0,
+      "reward": 0.2734375,
+      "reward_std": 0.307217001914978,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999555945396423,
+      "sampling/importance_sampling_ratio/min": 0.0006022047018632293,
+      "sampling/sampling_logp_difference/max": 7.414913177490234,
+      "sampling/sampling_logp_difference/mean": 0.02150837704539299,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 9.068485269381199e-06,
+      "clip_ratio/high_mean": 2.2671213173452998e-06,
+      "clip_ratio/low_mean": 1.9822365402433206e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.208948649240483e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16099.0,
+      "completions/mean_length": 6779.6171875,
+      "completions/mean_terminated_length": 6703.9921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8940552547574043,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0010163087863475084,
+      "learning_rate": 1e-05,
+      "loss": 0.0249,
+      "num_tokens": 98429036.0,
+      "reward": 0.453125,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485015869141,
+      "sampling/importance_sampling_ratio/min": 3.464699460664633e-08,
+      "sampling/sampling_logp_difference/max": 17.178054809570312,
+      "sampling/sampling_logp_difference/mean": 0.018716152757406235,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 5.047242211730918e-06,
+      "clip_ratio/high_mean": 1.2618105529327295e-06,
+      "clip_ratio/low_mean": 2.9014110396019532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0275920835265424e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14549.0,
+      "completions/max_terminated_length": 14549.0,
+      "completions/mean_length": 5766.71875,
+      "completions/mean_terminated_length": 5766.71875,
+      "completions/min_length": 47.0,
+      "completions/min_terminated_length": 47.0,
+      "entropy": 1.0455922111868858,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002155766822397709,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 99184264.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999253749847412,
+      "sampling/importance_sampling_ratio/min": 0.00010798005678225309,
+      "sampling/sampling_logp_difference/max": 9.133563995361328,
+      "sampling/sampling_logp_difference/mean": 0.020948775112628937,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 2.0882574972347356e-05,
+      "clip_ratio/high_mean": 6.505383225885453e-06,
+      "clip_ratio/low_mean": 4.496008500609605e-05,
+      "clip_ratio/low_min": 7.757854064038838e-06,
+      "clip_ratio/region_mean": 5.1465468231981504e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14704.0,
+      "completions/mean_length": 6167.2421875,
+      "completions/mean_terminated_length": 6005.07177734375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "entropy": 0.9100174158811569,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0021464223973453045,
+      "learning_rate": 1e-05,
+      "loss": -0.0279,
+      "num_tokens": 99996831.0,
+      "reward": 0.421875,
+      "reward_std": 0.3916535973548889,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240040779114,
+      "sampling/importance_sampling_ratio/min": 0.02249590866267681,
+      "sampling/sampling_logp_difference/max": 3.794421911239624,
+      "sampling/sampling_logp_difference/mean": 0.01866895705461502,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0998018473837874e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0998018473837874e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15738.0,
+      "completions/mean_length": 6242.9453125,
+      "completions/mean_terminated_length": 6163.09423828125,
+      "completions/min_length": 1187.0,
+      "completions/min_terminated_length": 1187.0,
+      "entropy": 0.8624134212732315,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023277695290744305,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 100814112.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999959409236908,
+      "sampling/importance_sampling_ratio/min": 0.0002393616596236825,
+      "sampling/sampling_logp_difference/max": 8.33753490447998,
+      "sampling/sampling_logp_difference/mean": 0.0191188994795084,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 6.589872555196052e-06,
+      "clip_ratio/high_mean": 1.647468138799013e-06,
+      "clip_ratio/low_mean": 4.329304238126497e-05,
+      "clip_ratio/low_min": 3.5120251595799346e-06,
+      "clip_ratio/region_mean": 4.494051017900347e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14866.0,
+      "completions/mean_length": 5733.6875,
+      "completions/mean_terminated_length": 5478.080078125,
+      "completions/min_length": 789.0,
+      "completions/min_terminated_length": 789.0,
+      "entropy": 0.9628067463636398,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003547821193933487,
+      "learning_rate": 1e-05,
+      "loss": 0.0321,
+      "num_tokens": 101566264.0,
+      "reward": 0.3984375,
+      "reward_std": 0.36584997177124023,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0001282035664189607,
+      "sampling/sampling_logp_difference/max": 8.961891174316406,
+      "sampling/sampling_logp_difference/mean": 0.019646761938929558,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.7107527582993498e-05,
+      "clip_ratio/high_mean": 4.2768818957483745e-06,
+      "clip_ratio/low_mean": 3.014796902789385e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.442485103732906e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15848.0,
+      "completions/max_terminated_length": 15848.0,
+      "completions/mean_length": 5505.9375,
+      "completions/mean_terminated_length": 5505.9375,
+      "completions/min_length": 668.0,
+      "completions/min_terminated_length": 668.0,
+      "entropy": 0.8041045889258385,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024891747161746025,
+      "learning_rate": 1e-05,
+      "loss": 0.1406,
+      "num_tokens": 102291456.0,
+      "reward": 0.5,
+      "reward_std": 0.35482609272003174,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999248385429382,
+      "sampling/importance_sampling_ratio/min": 0.0014627616619691253,
+      "sampling/sampling_logp_difference/max": 6.527429103851318,
+      "sampling/sampling_logp_difference/mean": 0.01716250739991665,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.548903105685895e-05,
+      "clip_ratio/high_mean": 3.872257764214737e-06,
+      "clip_ratio/low_mean": 5.380711581892683e-05,
+      "clip_ratio/low_min": 4.5777483137499075e-06,
+      "clip_ratio/region_mean": 5.767937363998499e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16005.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 5003.0625,
+      "completions/mean_terminated_length": 5003.0625,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 0.9115714654326439,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00220683915540576,
+      "learning_rate": 1e-05,
+      "loss": 0.1361,
+      "num_tokens": 102949824.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 8.323705696966499e-05,
+      "sampling/sampling_logp_difference/max": 9.393817901611328,
+      "sampling/sampling_logp_difference/mean": 0.018076512962579727,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.181136096623959e-05,
+      "clip_ratio/high_mean": 5.4528402415598975e-06,
+      "clip_ratio/low_mean": 3.4416837252138066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986967681157694e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15658.0,
+      "completions/max_terminated_length": 15658.0,
+      "completions/mean_length": 4742.1328125,
+      "completions/mean_terminated_length": 4742.1328125,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 0.9430246204137802,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003964806906878948,
+      "learning_rate": 1e-05,
+      "loss": 0.0215,
+      "num_tokens": 103580913.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 7.031940185697749e-05,
+      "sampling/sampling_logp_difference/max": 9.56246280670166,
+      "sampling/sampling_logp_difference/mean": 0.019651200622320175,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 4.07684046876966e-06,
+      "clip_ratio/high_mean": 1.019210117192415e-06,
+      "clip_ratio/low_mean": 3.8682398553646635e-05,
+      "clip_ratio/low_min": 8.189203072106466e-06,
+      "clip_ratio/region_mean": 3.970160832977854e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 6574.171875,
+      "completions/mean_terminated_length": 6091.72119140625,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.8429529070854187,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002067410387098789,
+      "learning_rate": 1e-05,
+      "loss": 0.0377,
+      "num_tokens": 104447463.0,
+      "reward": 0.3125,
+      "reward_std": 0.24511480331420898,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997583627700806,
+      "sampling/importance_sampling_ratio/min": 0.00021258489869069308,
+      "sampling/sampling_logp_difference/max": 8.456169128417969,
+      "sampling/sampling_logp_difference/mean": 0.018853647634387016,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 1.9725823221961036e-05,
+      "clip_ratio/high_mean": 4.931455805490259e-06,
+      "clip_ratio/low_mean": 5.9263072444082354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.419452870431996e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15518.0,
+      "completions/max_terminated_length": 15518.0,
+      "completions/mean_length": 4581.5625,
+      "completions/mean_terminated_length": 4581.5625,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "entropy": 0.7094272822141647,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004292502999305725,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 105052287.0,
+      "reward": 0.625,
+      "reward_std": 0.3908300995826721,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.0019342642044648528,
+      "sampling/sampling_logp_difference/max": 6.24802827835083,
+      "sampling/sampling_logp_difference/mean": 0.016310662031173706,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.0132298029930098e-05,
+      "clip_ratio/high_mean": 2.5330745074825245e-06,
+      "clip_ratio/low_mean": 4.6397121650443296e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.893019581686531e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16097.0,
+      "completions/mean_length": 7066.4453125,
+      "completions/mean_terminated_length": 6918.5478515625,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8481669947504997,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015785128343850374,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 105977048.0,
+      "reward": 0.3515625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 0.00104097044095397,
+      "sampling/sampling_logp_difference/max": 6.8676018714904785,
+      "sampling/sampling_logp_difference/mean": 0.018304405733942986,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 1.6989023606583942e-05,
+      "clip_ratio/high_mean": 4.2472559016459854e-06,
+      "clip_ratio/low_mean": 2.3075059743860038e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7322315418132348e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16104.0,
+      "completions/max_terminated_length": 16104.0,
+      "completions/mean_length": 6230.5234375,
+      "completions/mean_terminated_length": 6230.5234375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "entropy": 0.9658062160015106,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002542720176279545,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 106793187.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3050953149795532,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000169277191162,
+      "sampling/importance_sampling_ratio/min": 0.0002781494113150984,
+      "sampling/sampling_logp_difference/max": 8.187352180480957,
+      "sampling/sampling_logp_difference/mean": 0.019391046836972237,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7597974508353218e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7597974508353218e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14216.0,
+      "completions/mean_length": 5690.5546875,
+      "completions/mean_terminated_length": 5606.3544921875,
+      "completions/min_length": 1124.0,
+      "completions/min_terminated_length": 1124.0,
+      "entropy": 1.0098655670881271,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001451602904126048,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 107539874.0,
+      "reward": 0.4296875,
+      "reward_std": 0.23304283618927002,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999307990074158,
+      "sampling/importance_sampling_ratio/min": 5.640022671116185e-09,
+      "sampling/sampling_logp_difference/max": 18.993377685546875,
+      "sampling/sampling_logp_difference/mean": 0.018607191741466522,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 1.2800467629858758e-05,
+      "clip_ratio/high_mean": 4.19954119479371e-06,
+      "clip_ratio/low_mean": 2.350350996493944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.770305115973315e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15791.0,
+      "completions/max_terminated_length": 15791.0,
+      "completions/mean_length": 5471.1328125,
+      "completions/mean_terminated_length": 5471.1328125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0413162112236023,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023549250327050686,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 108260091.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999832510948181,
+      "sampling/importance_sampling_ratio/min": 0.0011709182290360332,
+      "sampling/sampling_logp_difference/max": 6.749967098236084,
+      "sampling/sampling_logp_difference/mean": 0.020427243784070015,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.1983064925734652e-05,
+      "clip_ratio/high_mean": 5.495766231433663e-06,
+      "clip_ratio/low_mean": 4.361141452591255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9107180757346214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 6211.7421875,
+      "completions/mean_terminated_length": 6050.2783203125,
+      "completions/min_length": 622.0,
+      "completions/min_terminated_length": 622.0,
+      "entropy": 0.9706784337759018,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017527056625112891,
+      "learning_rate": 1e-05,
+      "loss": 0.0686,
+      "num_tokens": 109073890.0,
+      "reward": 0.421875,
+      "reward_std": 0.29826050996780396,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999092221260071,
+      "sampling/importance_sampling_ratio/min": 0.002898645820096135,
+      "sampling/sampling_logp_difference/max": 5.843511581420898,
+      "sampling/sampling_logp_difference/mean": 0.018898162990808487,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.208964992358233e-05,
+      "clip_ratio/low_min": 3.9168990042526275e-06,
+      "clip_ratio/region_mean": 4.208964992358233e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14880.0,
+      "completions/mean_length": 6007.8984375,
+      "completions/mean_terminated_length": 5926.19677734375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.1967609524726868,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0007858420140109956,
+      "learning_rate": 1e-05,
+      "loss": 0.011,
+      "num_tokens": 109861813.0,
+      "reward": 0.296875,
+      "reward_std": 0.23486506938934326,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 3.294382011631569e-08,
+      "sampling/sampling_logp_difference/max": 17.22846221923828,
+      "sampling/sampling_logp_difference/mean": 0.021845955401659012,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 4.5118208618077915e-06,
+      "clip_ratio/high_mean": 1.1279552154519479e-06,
+      "clip_ratio/low_mean": 3.749712686840212e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8625082197540905e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15838.0,
+      "completions/mean_length": 6800.9921875,
+      "completions/mean_terminated_length": 6725.53564453125,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "entropy": 1.0437887012958527,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029428249690681696,
+      "learning_rate": 1e-05,
+      "loss": 0.0405,
+      "num_tokens": 110756572.0,
+      "reward": 0.265625,
+      "reward_std": 0.3248382806777954,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999890327453613,
+      "sampling/importance_sampling_ratio/min": 0.0006329434108920395,
+      "sampling/sampling_logp_difference/max": 7.365129470825195,
+      "sampling/sampling_logp_difference/mean": 0.02010120078921318,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.427700522071973e-05,
+      "clip_ratio/high_mean": 3.5692513051799324e-06,
+      "clip_ratio/low_mean": 4.964020990883e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.320946092979284e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6309.4453125,
+      "completions/mean_terminated_length": 6230.1181640625,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "entropy": 0.9768906533718109,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002088683657348156,
+      "learning_rate": 1e-05,
+      "loss": 0.0316,
+      "num_tokens": 111585493.0,
+      "reward": 0.375,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000007152557373,
+      "sampling/importance_sampling_ratio/min": 0.009723234921693802,
+      "sampling/sampling_logp_difference/max": 4.633236885070801,
+      "sampling/sampling_logp_difference/mean": 0.020927833393216133,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 5.4841398196003865e-06,
+      "clip_ratio/high_mean": 1.3710349549000966e-06,
+      "clip_ratio/low_mean": 5.122006064084417e-05,
+      "clip_ratio/low_min": 3.785125954891555e-06,
+      "clip_ratio/region_mean": 5.25910957094311e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15209.0,
+      "completions/mean_length": 6221.859375,
+      "completions/mean_terminated_length": 6060.5556640625,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "entropy": 0.9212924689054489,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002406956860795617,
+      "learning_rate": 1e-05,
+      "loss": 0.1051,
+      "num_tokens": 112400363.0,
+      "reward": 0.40625,
+      "reward_std": 0.31929677724838257,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 5.8308287407271564e-05,
+      "sampling/sampling_logp_difference/max": 9.74976634979248,
+      "sampling/sampling_logp_difference/mean": 0.018652018159627914,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 1.4568151755156578e-05,
+      "clip_ratio/high_mean": 3.6420379387891444e-06,
+      "clip_ratio/low_mean": 3.999794398623635e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3639981413434725e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14997.0,
+      "completions/mean_length": 6942.8203125,
+      "completions/mean_terminated_length": 6716.232421875,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "entropy": 0.949538916349411,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022962254006415606,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 113308748.0,
+      "reward": 0.375,
+      "reward_std": 0.3329663872718811,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999334812164307,
+      "sampling/importance_sampling_ratio/min": 0.00048810525913722813,
+      "sampling/sampling_logp_difference/max": 7.624979496002197,
+      "sampling/sampling_logp_difference/mean": 0.01939917355775833,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 8.786732450971613e-06,
+      "clip_ratio/high_mean": 2.196683112742903e-06,
+      "clip_ratio/low_mean": 5.562954720517155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.7826231113722315e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15182.0,
+      "completions/mean_length": 6783.1796875,
+      "completions/mean_terminated_length": 6552.76025390625,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "entropy": 0.9774708449840546,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020560629200190306,
+      "learning_rate": 1e-05,
+      "loss": 0.0473,
+      "num_tokens": 114196235.0,
+      "reward": 0.34375,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998990297317505,
+      "sampling/importance_sampling_ratio/min": 2.4757892447269114e-07,
+      "sampling/sampling_logp_difference/max": 15.211536407470703,
+      "sampling/sampling_logp_difference/mean": 0.019691556692123413,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.799483243303257e-05,
+      "clip_ratio/high_mean": 4.498708108258143e-06,
+      "clip_ratio/low_mean": 2.6389980291696702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0888688343111426e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15549.0,
+      "completions/mean_length": 5568.15625,
+      "completions/mean_terminated_length": 5396.4765625,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "entropy": 0.9303529411554337,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022214846685528755,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 114928047.0,
+      "reward": 0.234375,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999408721923828,
+      "sampling/importance_sampling_ratio/min": 2.1446083337650634e-05,
+      "sampling/sampling_logp_difference/max": 10.749968528747559,
+      "sampling/sampling_logp_difference/mean": 0.01938418298959732,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 1.1957493370573502e-05,
+      "clip_ratio/high_mean": 2.9893733426433755e-06,
+      "clip_ratio/low_mean": 5.885063319510664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.184000585562899e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15340.0,
+      "completions/max_terminated_length": 15340.0,
+      "completions/mean_length": 6086.578125,
+      "completions/mean_terminated_length": 6086.578125,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 0.9131873697042465,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002448044717311859,
+      "learning_rate": 1e-05,
+      "loss": 0.0599,
+      "num_tokens": 115725657.0,
+      "reward": 0.40625,
+      "reward_std": 0.35878273844718933,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999779462814331,
+      "sampling/importance_sampling_ratio/min": 0.02929726243019104,
+      "sampling/sampling_logp_difference/max": 3.530261278152466,
+      "sampling/sampling_logp_difference/mean": 0.019298439845442772,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 1.3385357760853367e-05,
+      "clip_ratio/high_mean": 3.3463394402133417e-06,
+      "clip_ratio/low_mean": 5.717015119444113e-05,
+      "clip_ratio/low_min": 3.4328400033700746e-06,
+      "clip_ratio/region_mean": 6.0516490520967636e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 6442.5390625,
+      "completions/mean_terminated_length": 6203.9443359375,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.8959419652819633,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002013204852119088,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 116571478.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000044584274292,
+      "sampling/importance_sampling_ratio/min": 1.0374163821325055e-06,
+      "sampling/sampling_logp_difference/max": 13.778777122497559,
+      "sampling/sampling_logp_difference/mean": 0.01925014518201351,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 9.34224021875707e-06,
+      "clip_ratio/high_mean": 3.136903728773177e-06,
+      "clip_ratio/low_mean": 2.9738095065567904e-05,
+      "clip_ratio/low_min": 3.7240065466903616e-06,
+      "clip_ratio/region_mean": 3.2874999135401595e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15946.0,
+      "completions/mean_length": 6633.5703125,
+      "completions/mean_terminated_length": 6319.0400390625,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.0223619118332863,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024523327592760324,
+      "learning_rate": 1e-05,
+      "loss": 0.056,
+      "num_tokens": 117440743.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 3.0026931199245155e-05,
+      "sampling/sampling_logp_difference/max": 10.413415908813477,
+      "sampling/sampling_logp_difference/mean": 0.02061290666460991,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 1.4537483366439119e-05,
+      "clip_ratio/high_mean": 3.6343708416097797e-06,
+      "clip_ratio/low_mean": 3.954866042477079e-05,
+      "clip_ratio/low_min": 9.874949228105834e-06,
+      "clip_ratio/region_mean": 4.318303126638057e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15919.0,
+      "completions/mean_length": 7183.0,
+      "completions/mean_terminated_length": 6886.193359375,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "entropy": 0.9815369099378586,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0018688985146582127,
+      "learning_rate": 1e-05,
+      "loss": 0.0395,
+      "num_tokens": 118380687.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2498900145292282,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999039173126221,
+      "sampling/importance_sampling_ratio/min": 1.3847662557964213e-05,
+      "sampling/sampling_logp_difference/max": 11.187394142150879,
+      "sampling/sampling_logp_difference/mean": 0.019792160019278526,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 7.165636361605721e-06,
+      "clip_ratio/high_mean": 1.7914090904014301e-06,
+      "clip_ratio/low_mean": 4.9011068711024564e-05,
+      "clip_ratio/low_min": 1.0991705721608014e-05,
+      "clip_ratio/region_mean": 5.0802477687739156e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16246.0,
+      "completions/mean_length": 6324.640625,
+      "completions/mean_terminated_length": 5829.91748046875,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "entropy": 0.852975606918335,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002005894435569644,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 119207089.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000035762786865,
+      "sampling/importance_sampling_ratio/min": 5.788659223071591e-07,
+      "sampling/sampling_logp_difference/max": 14.362195014953613,
+      "sampling/sampling_logp_difference/mean": 0.01853565312922001,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 7.795394822096569e-06,
+      "clip_ratio/high_mean": 1.948848705524142e-06,
+      "clip_ratio/low_mean": 3.834237736555224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0291225786859286e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 5723.421875,
+      "completions/mean_terminated_length": 5290.06494140625,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.8744911625981331,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002577397273853421,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 119961895.0,
+      "reward": 0.390625,
+      "reward_std": 0.34321609139442444,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999703764915466,
+      "sampling/importance_sampling_ratio/min": 0.07882421463727951,
+      "sampling/sampling_logp_difference/max": 2.5405349731445312,
+      "sampling/sampling_logp_difference/mean": 0.018341556191444397,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 9.214097190124448e-06,
+      "clip_ratio/high_mean": 2.303524297531112e-06,
+      "clip_ratio/low_mean": 2.636873176697918e-05,
+      "clip_ratio/low_min": 2.9339967113628518e-06,
+      "clip_ratio/region_mean": 2.8672255837136618e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16055.0,
+      "completions/mean_length": 7886.015625,
+      "completions/mean_terminated_length": 7682.064453125,
+      "completions/min_length": 989.0,
+      "completions/min_terminated_length": 989.0,
+      "entropy": 0.9391767829656601,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002552987542003393,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 120990289.0,
+      "reward": 0.328125,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000030994415283,
+      "sampling/importance_sampling_ratio/min": 0.000899312668479979,
+      "sampling/sampling_logp_difference/max": 7.013879776000977,
+      "sampling/sampling_logp_difference/mean": 0.02049873024225235,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 3.406416203688423e-05,
+      "clip_ratio/high_mean": 9.72330332160709e-06,
+      "clip_ratio/low_mean": 3.168332909808669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140663151019908e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 6173.1640625,
+      "completions/mean_terminated_length": 6011.087890625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.9148785546422005,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002678362652659416,
+      "learning_rate": 1e-05,
+      "loss": 0.039,
+      "num_tokens": 121797958.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3608373999595642,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999265074729919,
+      "sampling/importance_sampling_ratio/min": 0.002013920107856393,
+      "sampling/sampling_logp_difference/max": 6.207672119140625,
+      "sampling/sampling_logp_difference/mean": 0.018977735191583633,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 1.8476588593330234e-05,
+      "clip_ratio/high_mean": 4.6191471483325586e-06,
+      "clip_ratio/low_mean": 4.459614581264759e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9215293188353826e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 6594.21875,
+      "completions/mean_terminated_length": 6196.259765625,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "entropy": 0.9486038386821747,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033711253199726343,
+      "learning_rate": 1e-05,
+      "loss": 0.026,
+      "num_tokens": 122661170.0,
+      "reward": 0.3828125,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998981356620789,
+      "sampling/importance_sampling_ratio/min": 0.0002968419576063752,
+      "sampling/sampling_logp_difference/max": 8.122310638427734,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 7.97335997049231e-06,
+      "clip_ratio/high_mean": 2.7343705824023345e-06,
+      "clip_ratio/low_mean": 5.420079878604156e-05,
+      "clip_ratio/low_min": 4.594068286678521e-06,
+      "clip_ratio/region_mean": 5.693517005056492e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15928.0,
+      "completions/mean_length": 6533.9453125,
+      "completions/mean_terminated_length": 6377.595703125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9986584335565567,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017857529455795884,
+      "learning_rate": 1e-05,
+      "loss": 0.0804,
+      "num_tokens": 123518107.0,
+      "reward": 0.34375,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998549818992615,
+      "sampling/importance_sampling_ratio/min": 9.012701411847956e-06,
+      "sampling/sampling_logp_difference/max": 11.616875648498535,
+      "sampling/sampling_logp_difference/mean": 0.02010391652584076,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 4.470512521947967e-06,
+      "clip_ratio/high_mean": 1.1176281304869917e-06,
+      "clip_ratio/low_mean": 3.5141094485879876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.625872295742738e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13212.0,
+      "completions/mean_length": 5742.21875,
+      "completions/mean_terminated_length": 5658.42529296875,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0379670709371567,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018227624241262674,
+      "learning_rate": 1e-05,
+      "loss": -0.0237,
+      "num_tokens": 124279031.0,
+      "reward": 0.21875,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998506903648376,
+      "sampling/importance_sampling_ratio/min": 0.0020977305248379707,
+      "sampling/sampling_logp_difference/max": 6.16689920425415,
+      "sampling/sampling_logp_difference/mean": 0.019987668842077255,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.0003542683989508e-05,
+      "clip_ratio/high_mean": 3.21091931709816e-06,
+      "clip_ratio/low_mean": 5.731009014198207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.0521009800140746e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7584.703125,
+      "completions/mean_terminated_length": 7515.41748046875,
+      "completions/min_length": 884.0,
+      "completions/min_terminated_length": 884.0,
+      "entropy": 0.953459307551384,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002219022251665592,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 125270761.0,
+      "reward": 0.359375,
+      "reward_std": 0.37033066153526306,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999880790710449,
+      "sampling/importance_sampling_ratio/min": 0.0024849213659763336,
+      "sampling/sampling_logp_difference/max": 5.997514247894287,
+      "sampling/sampling_logp_difference/mean": 0.020291510969400406,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 7.734669452474918e-06,
+      "clip_ratio/high_mean": 1.9336673631187296e-06,
+      "clip_ratio/low_mean": 3.1135301298945706e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3068968605221016e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 4714.671875,
+      "completions/mean_terminated_length": 4622.78759765625,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 1.018719919025898,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0014189074281603098,
+      "learning_rate": 1e-05,
+      "loss": 0.0501,
+      "num_tokens": 125895279.0,
+      "reward": 0.3984375,
+      "reward_std": 0.28383445739746094,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479651451111,
+      "sampling/importance_sampling_ratio/min": 4.017410901724361e-07,
+      "sampling/sampling_logp_difference/max": 14.727458000183105,
+      "sampling/sampling_logp_difference/mean": 0.018739396706223488,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 1.0069575182569679e-05,
+      "clip_ratio/high_mean": 2.5173937956424197e-06,
+      "clip_ratio/low_mean": 3.824179225375701e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0759185367278405e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15913.0,
+      "completions/mean_length": 6316.140625,
+      "completions/mean_terminated_length": 6074.51220703125,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "entropy": 0.9325072392821312,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001702460227534175,
+      "learning_rate": 1e-05,
+      "loss": 0.1007,
+      "num_tokens": 126722881.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999539852142334,
+      "sampling/importance_sampling_ratio/min": 0.0012551364488899708,
+      "sampling/sampling_logp_difference/max": 6.680510997772217,
+      "sampling/sampling_logp_difference/mean": 0.01929408684372902,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 6.873041002108948e-06,
+      "clip_ratio/high_mean": 1.718260250527237e-06,
+      "clip_ratio/low_mean": 3.119859468370123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.291685527528898e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15832.0,
+      "completions/mean_length": 4687.140625,
+      "completions/mean_terminated_length": 4595.03955078125,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 1.0886607319116592,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032931750174611807,
+      "learning_rate": 1e-05,
+      "loss": 0.0078,
+      "num_tokens": 127341715.0,
+      "reward": 0.28125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821186065674,
+      "sampling/importance_sampling_ratio/min": 0.0019364450126886368,
+      "sampling/sampling_logp_difference/max": 6.246901512145996,
+      "sampling/sampling_logp_difference/mean": 0.020621225237846375,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 1.773085250533768e-05,
+      "clip_ratio/high_mean": 4.43271312633442e-06,
+      "clip_ratio/low_mean": 4.30743207289197e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7507033741567284e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14125.0,
+      "completions/mean_length": 5705.515625,
+      "completions/mean_terminated_length": 5449.232421875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0523068830370903,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0031696646474301815,
+      "learning_rate": 1e-05,
+      "loss": -0.0414,
+      "num_tokens": 128093597.0,
+      "reward": 0.1953125,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619126319885,
+      "sampling/importance_sampling_ratio/min": 3.197810656274669e-05,
+      "sampling/sampling_logp_difference/max": 10.350459098815918,
+      "sampling/sampling_logp_difference/mean": 0.021961934864521027,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 1.885905066956184e-05,
+      "clip_ratio/high_mean": 4.71476266739046e-06,
+      "clip_ratio/low_mean": 5.0530389898995054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.524515336219338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15958.0,
+      "completions/mean_length": 6214.4921875,
+      "completions/mean_terminated_length": 6053.07177734375,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.9371421113610268,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023704832419753075,
+      "learning_rate": 1e-05,
+      "loss": 0.075,
+      "num_tokens": 128906948.0,
+      "reward": 0.40625,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.0003354824730195105,
+      "sampling/sampling_logp_difference/max": 7.999940872192383,
+      "sampling/sampling_logp_difference/mean": 0.01882763020694256,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 3.042072216885572e-05,
+      "clip_ratio/high_mean": 7.60518054221393e-06,
+      "clip_ratio/low_mean": 4.5897569179942366e-05,
+      "clip_ratio/low_min": 8.727477506909054e-06,
+      "clip_ratio/region_mean": 5.3502750233747065e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 7127.0703125,
+      "completions/mean_terminated_length": 7054.18115234375,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.9854387491941452,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003370177699252963,
+      "learning_rate": 1e-05,
+      "loss": 0.1197,
+      "num_tokens": 129839813.0,
+      "reward": 0.359375,
+      "reward_std": 0.3329663574695587,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999907910823822,
+      "sampling/importance_sampling_ratio/min": 1.077816432371037e-05,
+      "sampling/sampling_logp_difference/max": 11.43798828125,
+      "sampling/sampling_logp_difference/mean": 0.019736800342798233,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.1401074718596647e-05,
+      "clip_ratio/high_mean": 6.243764005375851e-06,
+      "clip_ratio/low_mean": 3.2797592325550795e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.904135610355297e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 6566.2890625,
+      "completions/mean_terminated_length": 6330.6640625,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.7978609576821327,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026055986527353525,
+      "learning_rate": 1e-05,
+      "loss": 0.0661,
+      "num_tokens": 130698370.0,
+      "reward": 0.5,
+      "reward_std": 0.36295419931411743,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999133944511414,
+      "sampling/importance_sampling_ratio/min": 0.00031152591691352427,
+      "sampling/sampling_logp_difference/max": 8.074028015136719,
+      "sampling/sampling_logp_difference/mean": 0.01787097379565239,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0564424403346493e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0564424403346493e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15576.0,
+      "completions/max_terminated_length": 15576.0,
+      "completions/mean_length": 7186.2890625,
+      "completions/mean_terminated_length": 7186.2890625,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 1.0232757329940796,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0023866184055805206,
+      "learning_rate": 1e-05,
+      "loss": 0.0683,
+      "num_tokens": 131637439.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2059282809495926,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999207258224487,
+      "sampling/importance_sampling_ratio/min": 0.0007378471200354397,
+      "sampling/sampling_logp_difference/max": 7.211773872375488,
+      "sampling/sampling_logp_difference/mean": 0.02137116715312004,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 4.037900725961663e-05,
+      "clip_ratio/high_mean": 1.0094751814904157e-05,
+      "clip_ratio/low_mean": 5.8380828136250784e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.847557995115494e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13638.0,
+      "completions/mean_length": 5591.5703125,
+      "completions/mean_terminated_length": 5420.26220703125,
+      "completions/min_length": 635.0,
+      "completions/min_terminated_length": 635.0,
+      "entropy": 0.9335208311676979,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003491115989163518,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 132371816.0,
+      "reward": 0.5,
+      "reward_std": 0.3406373858451843,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999891459941864,
+      "sampling/importance_sampling_ratio/min": 0.00012356207298580557,
+      "sampling/sampling_logp_difference/max": 8.998766899108887,
+      "sampling/sampling_logp_difference/mean": 0.018760837614536285,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 2.8378776733006816e-06,
+      "clip_ratio/high_mean": 7.094694183251704e-07,
+      "clip_ratio/low_mean": 4.4085751369493664e-05,
+      "clip_ratio/low_min": 6.7955093072669115e-06,
+      "clip_ratio/region_mean": 4.4795220674132e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16302.0,
+      "completions/mean_length": 7152.3828125,
+      "completions/mean_terminated_length": 6930.82421875,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.1329835206270218,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002830669516697526,
+      "learning_rate": 1e-05,
+      "loss": 0.0526,
+      "num_tokens": 133307297.0,
+      "reward": 0.28125,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999501705169678,
+      "sampling/importance_sampling_ratio/min": 0.00028047082014381886,
+      "sampling/sampling_logp_difference/max": 8.179040908813477,
+      "sampling/sampling_logp_difference/mean": 0.021548541262745857,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.0150829439226072e-05,
+      "clip_ratio/high_mean": 2.537707359806518e-06,
+      "clip_ratio/low_mean": 3.4009618616437365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.654732597624388e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15068.0,
+      "completions/mean_length": 7263.453125,
+      "completions/mean_terminated_length": 7118.68310546875,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.092760555446148,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0027821618132293224,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 134260107.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999946117401123,
+      "sampling/importance_sampling_ratio/min": 7.832317351130769e-05,
+      "sampling/sampling_logp_difference/max": 9.454667091369629,
+      "sampling/sampling_logp_difference/mean": 0.022098438814282417,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 1.0561876024439698e-05,
+      "clip_ratio/high_mean": 2.6404690061099245e-06,
+      "clip_ratio/low_mean": 1.6864279416495265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9504748649978865e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15388.0,
+      "completions/mean_length": 7088.8125,
+      "completions/mean_terminated_length": 6710.958984375,
+      "completions/min_length": 1314.0,
+      "completions/min_terminated_length": 1314.0,
+      "entropy": 1.0669445469975471,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0007076738984324038,
+      "learning_rate": 1e-05,
+      "loss": -0.0197,
+      "num_tokens": 135186139.0,
+      "reward": 0.328125,
+      "reward_std": 0.20593319833278656,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998199343681335,
+      "sampling/importance_sampling_ratio/min": 3.084653872065246e-05,
+      "sampling/sampling_logp_difference/max": 10.386486053466797,
+      "sampling/sampling_logp_difference/mean": 0.020075790584087372,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 7.095016371749807e-06,
+      "clip_ratio/high_mean": 1.7737540929374518e-06,
+      "clip_ratio/low_mean": 2.7592465016823553e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.936621888238733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15626.0,
+      "completions/max_terminated_length": 15626.0,
+      "completions/mean_length": 5352.734375,
+      "completions/mean_terminated_length": 5352.734375,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "entropy": 1.0387161895632744,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0022445612121373415,
+      "learning_rate": 1e-05,
+      "loss": 0.0261,
+      "num_tokens": 135888929.0,
+      "reward": 0.4765625,
+      "reward_std": 0.399257630109787,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054670333862,
+      "sampling/importance_sampling_ratio/min": 0.00032565294532105327,
+      "sampling/sampling_logp_difference/max": 8.029678344726562,
+      "sampling/sampling_logp_difference/mean": 0.02010166086256504,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 1.5100852124305675e-05,
+      "clip_ratio/high_mean": 4.426987970873597e-06,
+      "clip_ratio/low_mean": 2.7625993425317574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2052981168817496e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16266.0,
+      "completions/mean_length": 7758.90625,
+      "completions/mean_terminated_length": 7408.29248046875,
+      "completions/min_length": 742.0,
+      "completions/min_terminated_length": 742.0,
+      "entropy": 1.0648984238505363,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022021254990249872,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 136901941.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858140945435,
+      "sampling/importance_sampling_ratio/min": 2.2461865967216e-07,
+      "sampling/sampling_logp_difference/max": 15.30886173248291,
+      "sampling/sampling_logp_difference/mean": 0.021426808089017868,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 2.5346608254039893e-05,
+      "clip_ratio/high_mean": 7.4063813144675805e-06,
+      "clip_ratio/low_mean": 2.2069365058996482e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9475746259777225e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 7036.953125,
+      "completions/mean_terminated_length": 6496.21484375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9684997871518135,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0013461806811392307,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 137824623.0,
+      "reward": 0.34375,
+      "reward_std": 0.2546031177043915,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999944806098938,
+      "sampling/importance_sampling_ratio/min": 5.834372132085264e-05,
+      "sampling/sampling_logp_difference/max": 9.74915885925293,
+      "sampling/sampling_logp_difference/mean": 0.020304443314671516,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.3147734080121154e-05,
+      "clip_ratio/high_mean": 3.2869335200302885e-06,
+      "clip_ratio/low_mean": 4.841489999307669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.170183294467279e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15500.0,
+      "completions/mean_length": 6114.1875,
+      "completions/mean_terminated_length": 5951.1748046875,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "entropy": 0.943072073161602,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132438588887453,
+      "learning_rate": 1e-05,
+      "loss": 0.0943,
+      "num_tokens": 138625247.0,
+      "reward": 0.40625,
+      "reward_std": 0.321650892496109,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999298453330994,
+      "sampling/importance_sampling_ratio/min": 0.0017275095451623201,
+      "sampling/sampling_logp_difference/max": 6.361074447631836,
+      "sampling/sampling_logp_difference/mean": 0.020084267482161522,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 1.7873157958092634e-05,
+      "clip_ratio/high_mean": 4.468289489523158e-06,
+      "clip_ratio/low_mean": 3.5252990301160025e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9721279790683184e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15050.0,
+      "completions/mean_length": 7618.875,
+      "completions/mean_terminated_length": 7034.53369140625,
+      "completions/min_length": 1030.0,
+      "completions/min_terminated_length": 1030.0,
+      "entropy": 0.9142575263977051,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026741649489849806,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 139619287.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 0.005949751473963261,
+      "sampling/sampling_logp_difference/max": 5.124405860900879,
+      "sampling/sampling_logp_difference/mean": 0.020061582326889038,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.0512151675357018e-05,
+      "clip_ratio/high_mean": 2.6280379188392544e-06,
+      "clip_ratio/low_mean": 4.5301517502593924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.792955542143318e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16106.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 5333.875,
+      "completions/mean_terminated_length": 5333.875,
+      "completions/min_length": 1109.0,
+      "completions/min_terminated_length": 1109.0,
+      "entropy": 0.8107482865452766,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027016003150492907,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 140318935.0,
+      "reward": 0.5703125,
+      "reward_std": 0.2556639611721039,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.006856904830783606,
+      "sampling/sampling_logp_difference/max": 4.982499122619629,
+      "sampling/sampling_logp_difference/mean": 0.017069874331355095,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.85085939392593e-05,
+      "clip_ratio/high_mean": 5.24943533264377e-06,
+      "clip_ratio/low_mean": 5.6120721524166584e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.137015702734061e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16050.0,
+      "completions/mean_length": 7443.3046875,
+      "completions/mean_terminated_length": 7154.89501953125,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 0.9224414080381393,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002655779244378209,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 141293534.0,
+      "reward": 0.234375,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999659061431885,
+      "sampling/importance_sampling_ratio/min": 0.00042018835665658116,
+      "sampling/sampling_logp_difference/max": 7.774807453155518,
+      "sampling/sampling_logp_difference/mean": 0.02006504125893116,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.494229445597739e-05,
+      "clip_ratio/high_mean": 3.7355736139943474e-06,
+      "clip_ratio/low_mean": 2.2748562741981004e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6484136355975352e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15923.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 5646.6875,
+      "completions/mean_terminated_length": 5646.6875,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.8945339694619179,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016281780553981662,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 142037438.0,
+      "reward": 0.46875,
+      "reward_std": 0.17912296950817108,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030517578125,
+      "sampling/importance_sampling_ratio/min": 0.0005717006279155612,
+      "sampling/sampling_logp_difference/max": 7.46689510345459,
+      "sampling/sampling_logp_difference/mean": 0.019336247816681862,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 3.335990868436056e-05,
+      "clip_ratio/high_mean": 8.33997717109014e-06,
+      "clip_ratio/low_mean": 3.5050728683927446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339070608239126e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14142.0,
+      "completions/mean_length": 6384.640625,
+      "completions/mean_terminated_length": 5892.86865234375,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.840093269944191,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002166559686884284,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 142873848.0,
+      "reward": 0.4765625,
+      "reward_std": 0.35506346821784973,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000462532043457,
+      "sampling/importance_sampling_ratio/min": 4.785555574926548e-06,
+      "sampling/sampling_logp_difference/max": 12.249908447265625,
+      "sampling/sampling_logp_difference/mean": 0.018109092488884926,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.541105484648142e-05,
+      "clip_ratio/high_mean": 3.852763711620355e-06,
+      "clip_ratio/low_mean": 4.0552770769863855e-05,
+      "clip_ratio/low_min": 7.133888630050933e-06,
+      "clip_ratio/region_mean": 4.440553459517105e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14828.0,
+      "completions/mean_length": 5775.0,
+      "completions/mean_terminated_length": 5691.46435546875,
+      "completions/min_length": 1147.0,
+      "completions/min_terminated_length": 1147.0,
+      "entropy": 0.8915362879633904,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021932912059128284,
+      "learning_rate": 1e-05,
+      "loss": -0.0086,
+      "num_tokens": 143636152.0,
+      "reward": 0.4375,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000008225440979,
+      "sampling/importance_sampling_ratio/min": 9.714113069492214e-09,
+      "sampling/sampling_logp_difference/max": 18.44968605041504,
+      "sampling/sampling_logp_difference/mean": 0.019278086721897125,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7509142171311396e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7509142171311396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6181.640625,
+      "completions/mean_terminated_length": 6019.69873046875,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "entropy": 1.0544511675834656,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022947140969336033,
+      "learning_rate": 1e-05,
+      "loss": 0.0242,
+      "num_tokens": 144447370.0,
+      "reward": 0.234375,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999147653579712,
+      "sampling/importance_sampling_ratio/min": 7.419757253046555e-08,
+      "sampling/sampling_logp_difference/max": 16.416534423828125,
+      "sampling/sampling_logp_difference/mean": 0.02050788700580597,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.5700999938417226e-05,
+      "clip_ratio/high_mean": 3.9252499846043065e-06,
+      "clip_ratio/low_mean": 2.4595847037289786e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8521096965050674e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 6542.3046875,
+      "completions/mean_terminated_length": 6306.1044921875,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.933225467801094,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034910975955426693,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 145303505.0,
+      "reward": 0.390625,
+      "reward_std": 0.30433881282806396,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999945163726807,
+      "sampling/importance_sampling_ratio/min": 0.007213745731860399,
+      "sampling/sampling_logp_difference/max": 4.931766986846924,
+      "sampling/sampling_logp_difference/mean": 0.020022759214043617,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.0999414017715026e-06,
+      "clip_ratio/high_mean": 1.5249853504428756e-06,
+      "clip_ratio/low_mean": 2.61421698724007e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7667155109156738e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 5889.4765625,
+      "completions/mean_terminated_length": 5637.6083984375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 0.9649673849344254,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024078311398625374,
+      "learning_rate": 1e-05,
+      "loss": 0.0391,
+      "num_tokens": 146082198.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999341368675232,
+      "sampling/importance_sampling_ratio/min": 0.0008680344326421618,
+      "sampling/sampling_logp_difference/max": 7.04927921295166,
+      "sampling/sampling_logp_difference/mean": 0.02060198038816452,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 7.789618393871933e-06,
+      "clip_ratio/high_mean": 1.9474045984679833e-06,
+      "clip_ratio/low_mean": 3.6395756637830345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.834316100892465e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16233.0,
+      "completions/mean_length": 5349.2421875,
+      "completions/mean_terminated_length": 5084.408203125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8402756005525589,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021191861014813185,
+      "learning_rate": 1e-05,
+      "loss": 0.1275,
+      "num_tokens": 146786245.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999837875366211,
+      "sampling/importance_sampling_ratio/min": 3.763807762879878e-05,
+      "sampling/sampling_logp_difference/max": 10.187494277954102,
+      "sampling/sampling_logp_difference/mean": 0.017112664878368378,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 1.2461773394534248e-05,
+      "clip_ratio/high_mean": 3.115443348633562e-06,
+      "clip_ratio/low_mean": 5.095924211673264e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4074685294835945e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 7272.3203125,
+      "completions/mean_terminated_length": 7053.64013671875,
+      "completions/min_length": 1074.0,
+      "completions/min_terminated_length": 1074.0,
+      "entropy": 0.9627499282360077,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022120666690170765,
+      "learning_rate": 1e-05,
+      "loss": 0.0079,
+      "num_tokens": 147737086.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27304792404174805,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999538660049438,
+      "sampling/importance_sampling_ratio/min": 1.6960719221970066e-05,
+      "sampling/sampling_logp_difference/max": 10.984610557556152,
+      "sampling/sampling_logp_difference/mean": 0.0203307643532753,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 1.7891727566166082e-05,
+      "clip_ratio/high_mean": 4.472931891541521e-06,
+      "clip_ratio/low_mean": 5.616715043288423e-05,
+      "clip_ratio/low_min": 7.80031223257538e-06,
+      "clip_ratio/region_mean": 6.064008221073891e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16212.0,
+      "completions/mean_length": 6387.1875,
+      "completions/mean_terminated_length": 5895.54052734375,
+      "completions/min_length": 1310.0,
+      "completions/min_terminated_length": 1310.0,
+      "entropy": 0.9110158830881119,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030851473566144705,
+      "learning_rate": 1e-05,
+      "loss": 0.1091,
+      "num_tokens": 148573782.0,
+      "reward": 0.40625,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997878074646,
+      "sampling/importance_sampling_ratio/min": 0.003961040172725916,
+      "sampling/sampling_logp_difference/max": 5.531248569488525,
+      "sampling/sampling_logp_difference/mean": 0.018049638718366623,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 1.6994396901282016e-05,
+      "clip_ratio/high_mean": 5.400205964178895e-06,
+      "clip_ratio/low_mean": 3.274822392995702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8148429439388565e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 7267.59375,
+      "completions/mean_terminated_length": 7195.81103515625,
+      "completions/min_length": 653.0,
+      "completions/min_terminated_length": 653.0,
+      "entropy": 0.9254888147115707,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020694085396826267,
+      "learning_rate": 1e-05,
+      "loss": 0.0462,
+      "num_tokens": 149521258.0,
+      "reward": 0.2734375,
+      "reward_std": 0.29719972610473633,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054670333862,
+      "sampling/importance_sampling_ratio/min": 7.411616934405174e-06,
+      "sampling/sampling_logp_difference/max": 11.812461853027344,
+      "sampling/sampling_logp_difference/mean": 0.01898832805454731,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 4.10414668294834e-06,
+      "clip_ratio/high_mean": 1.026036670737085e-06,
+      "clip_ratio/low_mean": 4.7441100377909606e-05,
+      "clip_ratio/low_min": 4.552241534838686e-06,
+      "clip_ratio/region_mean": 4.8467136821273016e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16076.0,
+      "completions/mean_length": 7100.1953125,
+      "completions/mean_terminated_length": 6952.83349609375,
+      "completions/min_length": 560.0,
+      "completions/min_terminated_length": 560.0,
+      "entropy": 0.8455610796809196,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003085972974076867,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 150447923.0,
+      "reward": 0.25,
+      "reward_std": 0.23645778000354767,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999178647994995,
+      "sampling/importance_sampling_ratio/min": 0.0011708807433024049,
+      "sampling/sampling_logp_difference/max": 6.749999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01974140852689743,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.6514521121280268e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6514521121280268e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15535.0,
+      "completions/mean_length": 6626.4296875,
+      "completions/mean_terminated_length": 6549.5986328125,
+      "completions/min_length": 1746.0,
+      "completions/min_terminated_length": 1746.0,
+      "entropy": 1.0323699787259102,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003505800850689411,
+      "learning_rate": 1e-05,
+      "loss": 0.0885,
+      "num_tokens": 151313834.0,
+      "reward": 0.390625,
+      "reward_std": 0.17176413536071777,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381303787231,
+      "sampling/importance_sampling_ratio/min": 2.8102756914449856e-05,
+      "sampling/sampling_logp_difference/max": 10.479642868041992,
+      "sampling/sampling_logp_difference/mean": 0.021082937717437744,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 2.006086378969485e-05,
+      "clip_ratio/high_mean": 5.890002398700744e-06,
+      "clip_ratio/low_mean": 3.503898199141986e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.092898473118112e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15595.0,
+      "completions/mean_length": 7093.109375,
+      "completions/mean_terminated_length": 6870.12841796875,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "entropy": 1.0206764563918114,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002495395252481103,
+      "learning_rate": 1e-05,
+      "loss": 0.0308,
+      "num_tokens": 152238192.0,
+      "reward": 0.2890625,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999728798866272,
+      "sampling/importance_sampling_ratio/min": 9.536534344078973e-05,
+      "sampling/sampling_logp_difference/max": 9.257795333862305,
+      "sampling/sampling_logp_difference/mean": 0.020610272884368896,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 3.2352409107261337e-06,
+      "clip_ratio/high_mean": 8.088102276815334e-07,
+      "clip_ratio/low_mean": 4.056704699451075e-05,
+      "clip_ratio/low_min": 1.1648833606159315e-05,
+      "clip_ratio/region_mean": 4.1375856994818605e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14191.0,
+      "completions/mean_length": 6795.71875,
+      "completions/mean_terminated_length": 6486.4189453125,
+      "completions/min_length": 424.0,
+      "completions/min_terminated_length": 424.0,
+      "entropy": 0.8927837759256363,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014066790463402867,
+      "learning_rate": 1e-05,
+      "loss": -0.0031,
+      "num_tokens": 153131828.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 5.093755135021638e-06,
+      "sampling/sampling_logp_difference/max": 12.187495231628418,
+      "sampling/sampling_logp_difference/mean": 0.01874586008489132,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 1.5244630048982799e-05,
+      "clip_ratio/high_mean": 3.8111575122456998e-06,
+      "clip_ratio/low_mean": 3.655197178886738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.03631290737394e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15831.0,
+      "completions/mean_length": 7075.1015625,
+      "completions/mean_terminated_length": 6617.28662109375,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 0.8989318311214447,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017937121447175741,
+      "learning_rate": 1e-05,
+      "loss": 0.0359,
+      "num_tokens": 154057097.0,
+      "reward": 0.3984375,
+      "reward_std": 0.23068872094154358,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998950958251953,
+      "sampling/importance_sampling_ratio/min": 0.00021659507183358073,
+      "sampling/sampling_logp_difference/max": 8.437480926513672,
+      "sampling/sampling_logp_difference/mean": 0.01890135183930397,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.4074375030759256e-05,
+      "clip_ratio/high_mean": 4.977033995601232e-06,
+      "clip_ratio/low_mean": 3.2670792506905855e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.764782627513341e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14100.0,
+      "completions/mean_length": 7120.0,
+      "completions/mean_terminated_length": 6743.41455078125,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "entropy": 0.8758384585380554,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003410576842725277,
+      "learning_rate": 1e-05,
+      "loss": 0.0536,
+      "num_tokens": 154988585.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999953508377075,
+      "sampling/importance_sampling_ratio/min": 0.003589102067053318,
+      "sampling/sampling_logp_difference/max": 5.629853248596191,
+      "sampling/sampling_logp_difference/mean": 0.018400676548480988,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.977112736994968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.977112736994968e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 6590.6796875,
+      "completions/mean_terminated_length": 6513.56689453125,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9243742749094963,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003304310142993927,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 155851000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999579787254333,
+      "sampling/importance_sampling_ratio/min": 1.2693599273916334e-06,
+      "sampling/sampling_logp_difference/max": 13.576997756958008,
+      "sampling/sampling_logp_difference/mean": 0.01959652081131935,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 1.1435367014200892e-05,
+      "clip_ratio/high_mean": 2.858841753550223e-06,
+      "clip_ratio/low_mean": 4.7742656533955596e-05,
+      "clip_ratio/low_min": 8.646529749967158e-06,
+      "clip_ratio/region_mean": 5.0601498060132144e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 6999.484375,
+      "completions/mean_terminated_length": 6696.7578125,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.843244343996048,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023830258287489414,
+      "learning_rate": 1e-05,
+      "loss": 0.1142,
+      "num_tokens": 156766782.0,
+      "reward": 0.359375,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998635053634644,
+      "sampling/importance_sampling_ratio/min": 0.00014761318743694574,
+      "sampling/sampling_logp_difference/max": 8.820915222167969,
+      "sampling/sampling_logp_difference/mean": 0.018434934318065643,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 2.5114631171163637e-05,
+      "clip_ratio/high_mean": 7.040741365926806e-06,
+      "clip_ratio/low_mean": 5.3607667723554187e-05,
+      "clip_ratio/low_min": 9.219345429301029e-06,
+      "clip_ratio/region_mean": 6.064840863473364e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14986.0,
+      "completions/mean_length": 6407.5,
+      "completions/mean_terminated_length": 6249.14306640625,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 0.9549195989966393,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024427250027656555,
+      "learning_rate": 1e-05,
+      "loss": 0.0795,
+      "num_tokens": 157606126.0,
+      "reward": 0.3515625,
+      "reward_std": 0.32879000902175903,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966025352478,
+      "sampling/importance_sampling_ratio/min": 0.0002305622911080718,
+      "sampling/sampling_logp_difference/max": 8.37498950958252,
+      "sampling/sampling_logp_difference/mean": 0.0192743968218565,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.928529067958152e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.928529067958152e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15519.0,
+      "completions/mean_length": 6638.390625,
+      "completions/mean_terminated_length": 5901.328125,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "entropy": 0.9070822075009346,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002024515997618437,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "num_tokens": 158474248.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999830722808838,
+      "sampling/importance_sampling_ratio/min": 0.0036068728659301996,
+      "sampling/sampling_logp_difference/max": 5.624914169311523,
+      "sampling/sampling_logp_difference/mean": 0.01955476775765419,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 8.365173471247545e-06,
+      "clip_ratio/high_mean": 2.091293367811886e-06,
+      "clip_ratio/low_mean": 4.1470637825113954e-05,
+      "clip_ratio/low_min": 4.027710474474588e-06,
+      "clip_ratio/region_mean": 4.356193130661268e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15708.0,
+      "completions/mean_length": 7324.546875,
+      "completions/mean_terminated_length": 6878.99951171875,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9108889549970627,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022787705529481173,
+      "learning_rate": 1e-05,
+      "loss": 0.0616,
+      "num_tokens": 159434350.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26515230536460876,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999351501464844,
+      "sampling/importance_sampling_ratio/min": 0.03948089852929115,
+      "sampling/sampling_logp_difference/max": 3.231938362121582,
+      "sampling/sampling_logp_difference/mean": 0.019122496247291565,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 8.65733409227687e-06,
+      "clip_ratio/high_mean": 2.1643335230692173e-06,
+      "clip_ratio/low_mean": 3.456336048657249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.672769389595487e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13983.0,
+      "completions/mean_length": 5520.4453125,
+      "completions/mean_terminated_length": 5434.9052734375,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 0.8982062339782715,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026195270475000143,
+      "learning_rate": 1e-05,
+      "loss": 0.049,
+      "num_tokens": 160163055.0,
+      "reward": 0.4375,
+      "reward_std": 0.24831004440784454,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998810291290283,
+      "sampling/importance_sampling_ratio/min": 0.0005541297141462564,
+      "sampling/sampling_logp_difference/max": 7.498111724853516,
+      "sampling/sampling_logp_difference/mean": 0.019064132124185562,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 1.8376186289970065e-05,
+      "clip_ratio/high_mean": 6.650576210631698e-06,
+      "clip_ratio/low_mean": 4.059042771586974e-05,
+      "clip_ratio/low_min": 5.350111223378917e-06,
+      "clip_ratio/region_mean": 4.724100449493562e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15267.0,
+      "completions/max_terminated_length": 15267.0,
+      "completions/mean_length": 6846.515625,
+      "completions/mean_terminated_length": 6846.515625,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9657742157578468,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0014831912703812122,
+      "learning_rate": 1e-05,
+      "loss": 0.006,
+      "num_tokens": 161057657.0,
+      "reward": 0.296875,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999252557754517,
+      "sampling/importance_sampling_ratio/min": 6.252834282349795e-05,
+      "sampling/sampling_logp_difference/max": 9.679890632629395,
+      "sampling/sampling_logp_difference/mean": 0.020372584462165833,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 1.658901419432368e-05,
+      "clip_ratio/high_mean": 4.14725354858092e-06,
+      "clip_ratio/low_mean": 4.473214539757464e-05,
+      "clip_ratio/low_min": 2.9674999950657366e-06,
+      "clip_ratio/region_mean": 4.887939894615556e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16370.0,
+      "completions/mean_length": 6946.8984375,
+      "completions/mean_terminated_length": 6642.4755859375,
+      "completions/min_length": 1133.0,
+      "completions/min_terminated_length": 1133.0,
+      "entropy": 0.8490508273243904,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017962189158424735,
+      "learning_rate": 1e-05,
+      "loss": 0.0696,
+      "num_tokens": 161966356.0,
+      "reward": 0.4296875,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 7.035569433355704e-05,
+      "sampling/sampling_logp_difference/max": 9.561946868896484,
+      "sampling/sampling_logp_difference/mean": 0.019146796315908432,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.22491199540309e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.22491199540309e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15123.0,
+      "completions/mean_length": 6618.9765625,
+      "completions/mean_terminated_length": 6463.9765625,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.9541772454977036,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017619321588426828,
+      "learning_rate": 1e-05,
+      "loss": 0.0509,
+      "num_tokens": 162836705.0,
+      "reward": 0.390625,
+      "reward_std": 0.2130674123764038,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999436140060425,
+      "sampling/importance_sampling_ratio/min": 4.2106199771296815e-07,
+      "sampling/sampling_logp_difference/max": 14.680485725402832,
+      "sampling/sampling_logp_difference/mean": 0.020236656069755554,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 1.6846054222696694e-05,
+      "clip_ratio/high_mean": 4.211513555674173e-06,
+      "clip_ratio/low_mean": 3.877300162002939e-05,
+      "clip_ratio/low_min": 4.230834292684449e-06,
+      "clip_ratio/region_mean": 4.298451551676408e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12469.0,
+      "completions/mean_length": 5485.71875,
+      "completions/mean_terminated_length": 5312.73046875,
+      "completions/min_length": 104.0,
+      "completions/min_terminated_length": 104.0,
+      "entropy": 0.8888534903526306,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002670915797352791,
+      "learning_rate": 1e-05,
+      "loss": 0.0709,
+      "num_tokens": 163558197.0,
+      "reward": 0.46875,
+      "reward_std": 0.3145885467529297,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000442266464233,
+      "sampling/importance_sampling_ratio/min": 0.0005042250850237906,
+      "sampling/sampling_logp_difference/max": 7.592487812042236,
+      "sampling/sampling_logp_difference/mean": 0.019581373780965805,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6889288480779214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6889288480779214e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16184.0,
+      "completions/mean_length": 4345.171875,
+      "completions/mean_terminated_length": 4250.3779296875,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.8308270424604416,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004005427472293377,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 164133499.0,
+      "reward": 0.578125,
+      "reward_std": 0.31642353534698486,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999247193336487,
+      "sampling/importance_sampling_ratio/min": 0.022981969639658928,
+      "sampling/sampling_logp_difference/max": 3.773045301437378,
+      "sampling/sampling_logp_difference/mean": 0.017508968710899353,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.2997116300539346e-05,
+      "clip_ratio/high_mean": 3.2492790751348366e-06,
+      "clip_ratio/low_mean": 2.723402121773688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0483300406558556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5227.296875,
+      "completions/mean_terminated_length": 5050.20654296875,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "entropy": 0.9231975972652435,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0031033784616738558,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 164823681.0,
+      "reward": 0.4765625,
+      "reward_std": 0.29249146580696106,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999896764755249,
+      "sampling/importance_sampling_ratio/min": 0.0021342060063034296,
+      "sampling/sampling_logp_difference/max": 6.149660587310791,
+      "sampling/sampling_logp_difference/mean": 0.019171088933944702,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 2.0835890609305352e-05,
+      "clip_ratio/high_mean": 5.208972652326338e-06,
+      "clip_ratio/low_mean": 2.9314877565411734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.452385044511175e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14160.0,
+      "completions/mean_length": 6473.4765625,
+      "completions/mean_terminated_length": 6316.1669921875,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 0.9061874598264694,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003495733719319105,
+      "learning_rate": 1e-05,
+      "loss": 0.0785,
+      "num_tokens": 165668798.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000354051589966,
+      "sampling/importance_sampling_ratio/min": 0.0004697878030128777,
+      "sampling/sampling_logp_difference/max": 7.663229465484619,
+      "sampling/sampling_logp_difference/mean": 0.018978482112288475,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.991967162164656e-05,
+      "clip_ratio/low_min": 6.304534053924726e-06,
+      "clip_ratio/region_mean": 3.991967162164656e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14659.0,
+      "completions/mean_length": 7140.1953125,
+      "completions/mean_terminated_length": 6605.4296875,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "entropy": 0.9605444446206093,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002381941769272089,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 166603375.0,
+      "reward": 0.3046875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 0.00043123820796608925,
+      "sampling/sampling_logp_difference/max": 7.748849868774414,
+      "sampling/sampling_logp_difference/mean": 0.021141134202480316,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.4948576790629886e-05,
+      "clip_ratio/high_mean": 3.7371441976574715e-06,
+      "clip_ratio/low_mean": 3.4953729482367635e-05,
+      "clip_ratio/low_min": 3.991060111729894e-06,
+      "clip_ratio/region_mean": 3.869087413477246e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13770.0,
+      "completions/mean_length": 5304.46875,
+      "completions/mean_terminated_length": 5038.56005859375,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.9176690131425858,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040566748939454556,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 167302275.0,
+      "reward": 0.4296875,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999827742576599,
+      "sampling/importance_sampling_ratio/min": 5.001809313398553e-07,
+      "sampling/sampling_logp_difference/max": 14.508296012878418,
+      "sampling/sampling_logp_difference/mean": 0.018822530284523964,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.653866999935417e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.653866999935417e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15791.0,
+      "completions/mean_length": 5796.5,
+      "completions/mean_terminated_length": 5542.400390625,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "entropy": 0.9230027198791504,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021502040326595306,
+      "learning_rate": 1e-05,
+      "loss": 0.0737,
+      "num_tokens": 168063627.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223351478577,
+      "sampling/importance_sampling_ratio/min": 0.009504453279078007,
+      "sampling/sampling_logp_difference/max": 4.655994892120361,
+      "sampling/sampling_logp_difference/mean": 0.01985779032111168,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 1.0863841453101486e-05,
+      "clip_ratio/high_mean": 2.7159603632753715e-06,
+      "clip_ratio/low_mean": 2.4175752741939505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6891713218901714e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14814.0,
+      "completions/mean_length": 6135.4921875,
+      "completions/mean_terminated_length": 6054.79541015625,
+      "completions/min_length": 1259.0,
+      "completions/min_terminated_length": 1259.0,
+      "entropy": 0.869445689022541,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027786416467279196,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 168867858.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999550580978394,
+      "sampling/importance_sampling_ratio/min": 2.6089865059475414e-05,
+      "sampling/sampling_logp_difference/max": 10.553963661193848,
+      "sampling/sampling_logp_difference/mean": 0.018514130264520645,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 4.36788013757905e-06,
+      "clip_ratio/high_mean": 1.0919700343947625e-06,
+      "clip_ratio/low_mean": 1.993327998661698e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0852980330564606e-06,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15691.0,
+      "completions/mean_length": 6268.2421875,
+      "completions/mean_terminated_length": 6025.46435546875,
+      "completions/min_length": 627.0,
+      "completions/min_terminated_length": 627.0,
+      "entropy": 0.951081782579422,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.75,
+      "grad_norm": 0.0007328780484385788,
+      "learning_rate": 1e-05,
+      "loss": 0.0188,
+      "num_tokens": 169689969.0,
+      "reward": 0.3828125,
+      "reward_std": 0.10994865000247955,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000133514404297,
+      "sampling/importance_sampling_ratio/min": 1.6650999896228313e-05,
+      "sampling/sampling_logp_difference/max": 11.003040313720703,
+      "sampling/sampling_logp_difference/mean": 0.02005261555314064,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 2.131336282218399e-05,
+      "clip_ratio/high_mean": 5.3283407055459975e-06,
+      "clip_ratio/low_mean": 3.5254403428552905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.058274430462916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13861.0,
+      "completions/mean_length": 5440.8984375,
+      "completions/mean_terminated_length": 5354.732421875,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 0.8271932750940323,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034721922129392624,
+      "learning_rate": 1e-05,
+      "loss": -0.0245,
+      "num_tokens": 170409292.0,
+      "reward": 0.53125,
+      "reward_std": 0.30327308177948,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998912811279297,
+      "sampling/importance_sampling_ratio/min": 1.8372484191786498e-05,
+      "sampling/sampling_logp_difference/max": 10.904656410217285,
+      "sampling/sampling_logp_difference/mean": 0.019136395305395126,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 1.2339016848272877e-05,
+      "clip_ratio/high_mean": 4.13687178024702e-06,
+      "clip_ratio/low_mean": 2.156280152121326e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.569967330146028e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15086.0,
+      "completions/mean_length": 6671.046875,
+      "completions/mean_terminated_length": 6594.56689453125,
+      "completions/min_length": 748.0,
+      "completions/min_terminated_length": 748.0,
+      "entropy": 0.9659745842218399,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027575206477195024,
+      "learning_rate": 1e-05,
+      "loss": 0.0286,
+      "num_tokens": 171280714.0,
+      "reward": 0.375,
+      "reward_std": 0.2109457552433014,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411702156067,
+      "sampling/importance_sampling_ratio/min": 1.5700872609158978e-05,
+      "sampling/sampling_logp_difference/max": 11.06179428100586,
+      "sampling/sampling_logp_difference/mean": 0.019089506939053535,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 1.4603458112105727e-05,
+      "clip_ratio/high_mean": 3.650864528026432e-06,
+      "clip_ratio/low_mean": 3.2977761520669446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.662862599185246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15752.0,
+      "completions/mean_length": 7781.5546875,
+      "completions/mean_terminated_length": 7504.05615234375,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 1.1691131889820099,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0012711051385849714,
+      "learning_rate": 1e-05,
+      "loss": 0.0115,
+      "num_tokens": 172302489.0,
+      "reward": 0.109375,
+      "reward_std": 0.1751839816570282,
+      "rewards/accuracy_reward/mean": 0.109375,
+      "rewards/accuracy_reward/std": 0.31333550810813904,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998820424079895,
+      "sampling/importance_sampling_ratio/min": 0.005086081102490425,
+      "sampling/sampling_logp_difference/max": 5.281247615814209,
+      "sampling/sampling_logp_difference/mean": 0.023309212177991867,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 6.842087486802484e-06,
+      "clip_ratio/high_mean": 1.710521871700621e-06,
+      "clip_ratio/low_mean": 4.5269940528669395e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6980462457213434e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14891.0,
+      "completions/mean_length": 6489.96875,
+      "completions/mean_terminated_length": 6332.9208984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9354017227888107,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0016933141741901636,
+      "learning_rate": 1e-05,
+      "loss": 0.0156,
+      "num_tokens": 173149653.0,
+      "reward": 0.484375,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 0.008998609147965908,
+      "sampling/sampling_logp_difference/max": 4.7106852531433105,
+      "sampling/sampling_logp_difference/mean": 0.019165027886629105,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 2.444740721330163e-05,
+      "clip_ratio/high_mean": 6.111851803325408e-06,
+      "clip_ratio/low_mean": 3.0998270403870265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.711012095664046e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14943.0,
+      "completions/max_terminated_length": 14943.0,
+      "completions/mean_length": 6309.75,
+      "completions/mean_terminated_length": 6309.75,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "entropy": 1.012483686208725,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024940327275544405,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 173976797.0,
+      "reward": 0.4375,
+      "reward_std": 0.2790592610836029,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 0.0018720829393714666,
+      "sampling/sampling_logp_difference/max": 6.280703544616699,
+      "sampling/sampling_logp_difference/mean": 0.020797956734895706,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 1.1112337460872368e-05,
+      "clip_ratio/high_mean": 3.5388877677178243e-06,
+      "clip_ratio/low_mean": 1.7024583712554886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.056347148027271e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16362.0,
+      "completions/mean_length": 7574.984375,
+      "completions/mean_terminated_length": 7363.568359375,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9144782647490501,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002748408354818821,
+      "learning_rate": 1e-05,
+      "loss": 0.0588,
+      "num_tokens": 174965259.0,
+      "reward": 0.2734375,
+      "reward_std": 0.25224411487579346,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000108480453491,
+      "sampling/importance_sampling_ratio/min": 0.005681300535798073,
+      "sampling/sampling_logp_difference/max": 5.170575141906738,
+      "sampling/sampling_logp_difference/mean": 0.019229793921113014,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 1.4946090004741563e-05,
+      "clip_ratio/high_mean": 3.736522501185391e-06,
+      "clip_ratio/low_mean": 3.722507381098694e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.096159636901575e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6962.7734375,
+      "completions/mean_terminated_length": 6499.43408203125,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9248140156269073,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020343128126114607,
+      "learning_rate": 1e-05,
+      "loss": 0.0714,
+      "num_tokens": 175876446.0,
+      "reward": 0.421875,
+      "reward_std": 0.3156445026397705,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 0.0001609467581147328,
+      "sampling/sampling_logp_difference/max": 8.734436988830566,
+      "sampling/sampling_logp_difference/mean": 0.01860032044351101,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 4.226114015182247e-06,
+      "clip_ratio/high_mean": 1.0565285037955618e-06,
+      "clip_ratio/low_mean": 3.189400638348161e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.295053488727717e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14978.0,
+      "completions/mean_length": 6422.28125,
+      "completions/mean_terminated_length": 6264.1591796875,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 0.7786787301301956,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029119597747921944,
+      "learning_rate": 1e-05,
+      "loss": 0.1116,
+      "num_tokens": 176717226.0,
+      "reward": 0.578125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918937683105,
+      "sampling/importance_sampling_ratio/min": 0.0006287595024332404,
+      "sampling/sampling_logp_difference/max": 7.371761798858643,
+      "sampling/sampling_logp_difference/mean": 0.01786171644926071,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 5.4112551879370585e-06,
+      "clip_ratio/high_mean": 1.3528137969842646e-06,
+      "clip_ratio/low_mean": 2.103693077515345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2389744572137715e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16030.0,
+      "completions/mean_length": 6662.65625,
+      "completions/mean_terminated_length": 6508.349609375,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9501350447535515,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0027519147843122482,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 177586766.0,
+      "reward": 0.421875,
+      "reward_std": 0.21382881700992584,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000051259994507,
+      "sampling/importance_sampling_ratio/min": 2.507045428501442e-05,
+      "sampling/sampling_logp_difference/max": 10.593820571899414,
+      "sampling/sampling_logp_difference/mean": 0.020679686218500137,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 3.2487785119883483e-06,
+      "clip_ratio/high_mean": 8.121946279970871e-07,
+      "clip_ratio/low_mean": 5.783435085504607e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8646545539886574e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15419.0,
+      "completions/mean_length": 6546.171875,
+      "completions/mean_terminated_length": 6146.259765625,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "entropy": 0.9217342138290405,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017936143558472395,
+      "learning_rate": 1e-05,
+      "loss": 0.0748,
+      "num_tokens": 178444556.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000327825546265,
+      "sampling/importance_sampling_ratio/min": 8.447741129202768e-05,
+      "sampling/sampling_logp_difference/max": 9.379026412963867,
+      "sampling/sampling_logp_difference/mean": 0.019764548167586327,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 2.1980493102091714e-05,
+      "clip_ratio/high_mean": 5.4951232755229285e-06,
+      "clip_ratio/low_mean": 4.3977801396977156e-05,
+      "clip_ratio/low_min": 7.912247156127705e-06,
+      "clip_ratio/region_mean": 4.947292427459615e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15707.0,
+      "completions/max_terminated_length": 15707.0,
+      "completions/mean_length": 6433.9296875,
+      "completions/mean_terminated_length": 6433.9296875,
+      "completions/min_length": 731.0,
+      "completions/min_terminated_length": 731.0,
+      "entropy": 0.9361409991979599,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0031324021983891726,
+      "learning_rate": 1e-05,
+      "loss": 0.0505,
+      "num_tokens": 179288499.0,
+      "reward": 0.453125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999454021453857,
+      "sampling/importance_sampling_ratio/min": 0.00018488657951820642,
+      "sampling/sampling_logp_difference/max": 8.595767974853516,
+      "sampling/sampling_logp_difference/mean": 0.019691072404384613,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 1.299416817346355e-05,
+      "clip_ratio/high_mean": 3.2485420433658874e-06,
+      "clip_ratio/low_mean": 3.756406420052372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.081260635757644e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15787.0,
+      "completions/mean_length": 6037.75,
+      "completions/mean_terminated_length": 5873.52392578125,
+      "completions/min_length": 551.0,
+      "completions/min_terminated_length": 551.0,
+      "entropy": 0.8700985535979271,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0024714914616197348,
+      "learning_rate": 1e-05,
+      "loss": 0.0044,
+      "num_tokens": 180079619.0,
+      "reward": 0.484375,
+      "reward_std": 0.21436560153961182,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999628067016602,
+      "sampling/importance_sampling_ratio/min": 8.4841696661897e-05,
+      "sampling/sampling_logp_difference/max": 9.374723434448242,
+      "sampling/sampling_logp_difference/mean": 0.018519341945648193,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 7.293307589861797e-06,
+      "clip_ratio/high_mean": 1.8233268974654493e-06,
+      "clip_ratio/low_mean": 2.2305866423266707e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.412919320704532e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12264.0,
+      "completions/max_terminated_length": 12264.0,
+      "completions/mean_length": 5305.828125,
+      "completions/mean_terminated_length": 5305.828125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "entropy": 1.1309608668088913,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003593914210796356,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 180780877.0,
+      "reward": 0.3984375,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011920928955,
+      "sampling/importance_sampling_ratio/min": 0.009941472671926022,
+      "sampling/sampling_logp_difference/max": 4.611040115356445,
+      "sampling/sampling_logp_difference/mean": 0.020471621304750443,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.0163415001661633e-05,
+      "clip_ratio/high_mean": 5.040853750415408e-06,
+      "clip_ratio/low_mean": 4.4980357415624894e-05,
+      "clip_ratio/low_min": 1.0012816346716136e-05,
+      "clip_ratio/region_mean": 5.0021211109196884e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13814.0,
+      "completions/mean_length": 6022.96875,
+      "completions/mean_terminated_length": 5774.30419921875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8560900762677193,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029816587921231985,
+      "learning_rate": 1e-05,
+      "loss": 0.0913,
+      "num_tokens": 181571465.0,
+      "reward": 0.515625,
+      "reward_std": 0.41504397988319397,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 1.5958334188326262e-05,
+      "sampling/sampling_logp_difference/max": 11.04552936553955,
+      "sampling/sampling_logp_difference/mean": 0.0181986466050148,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 1.8430865566188004e-05,
+      "clip_ratio/high_mean": 6.177042905619601e-06,
+      "clip_ratio/low_mean": 4.450247388376738e-05,
+      "clip_ratio/low_min": 4.840271230932558e-06,
+      "clip_ratio/region_mean": 5.067951724413433e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15130.0,
+      "completions/max_terminated_length": 15130.0,
+      "completions/mean_length": 6647.71875,
+      "completions/mean_terminated_length": 6647.71875,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "entropy": 0.9455481320619583,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0031632622703909874,
+      "learning_rate": 1e-05,
+      "loss": 0.1317,
+      "num_tokens": 182440957.0,
+      "reward": 0.3828125,
+      "reward_std": 0.39902517199516296,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000306367874146,
+      "sampling/importance_sampling_ratio/min": 1.4739508515049238e-05,
+      "sampling/sampling_logp_difference/max": 11.124979019165039,
+      "sampling/sampling_logp_difference/mean": 0.01906408555805683,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 2.2937053017813014e-05,
+      "clip_ratio/high_mean": 5.7342632544532535e-06,
+      "clip_ratio/low_mean": 6.042617155799235e-05,
+      "clip_ratio/low_min": 1.1000354334100848e-05,
+      "clip_ratio/region_mean": 6.616043401663774e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15988.0,
+      "completions/mean_length": 6809.1640625,
+      "completions/mean_terminated_length": 6500.29833984375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 1.050546184182167,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00162694591563195,
+      "learning_rate": 1e-05,
+      "loss": 0.0346,
+      "num_tokens": 183332242.0,
+      "reward": 0.421875,
+      "reward_std": 0.33616161346435547,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000290870666504,
+      "sampling/importance_sampling_ratio/min": 4.244970114086755e-06,
+      "sampling/sampling_logp_difference/max": 12.369775772094727,
+      "sampling/sampling_logp_difference/mean": 0.021866722032427788,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 1.4678411844215589e-05,
+      "clip_ratio/high_mean": 3.669602961053897e-06,
+      "clip_ratio/low_mean": 2.4373607971028832e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8043211159456405e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6815.5,
+      "completions/mean_terminated_length": 6506.83837890625,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "entropy": 1.060033954679966,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024887355975806713,
+      "learning_rate": 1e-05,
+      "loss": 0.1059,
+      "num_tokens": 184225138.0,
+      "reward": 0.328125,
+      "reward_std": 0.2869548499584198,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999393820762634,
+      "sampling/importance_sampling_ratio/min": 0.00012930770753882825,
+      "sampling/sampling_logp_difference/max": 8.953315734863281,
+      "sampling/sampling_logp_difference/mean": 0.02019432932138443,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.910891326901037e-06,
+      "clip_ratio/high_mean": 1.9777228317252593e-06,
+      "clip_ratio/low_mean": 3.8802519611635944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.078024221598753e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15838.0,
+      "completions/mean_length": 6928.4453125,
+      "completions/mean_terminated_length": 6623.42724609375,
+      "completions/min_length": 304.0,
+      "completions/min_terminated_length": 304.0,
+      "entropy": 0.9051575735211372,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002783838426694274,
+      "learning_rate": 1e-05,
+      "loss": 0.0624,
+      "num_tokens": 185136323.0,
+      "reward": 0.3359375,
+      "reward_std": 0.25460803508758545,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999524354934692,
+      "sampling/importance_sampling_ratio/min": 1.0146355634788051e-05,
+      "sampling/sampling_logp_difference/max": 11.498395919799805,
+      "sampling/sampling_logp_difference/mean": 0.01905050128698349,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 4.399394583742833e-06,
+      "clip_ratio/high_mean": 1.0998486459357082e-06,
+      "clip_ratio/low_mean": 1.733424267058581e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8434091430208355e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14703.0,
+      "completions/mean_length": 7155.1328125,
+      "completions/mean_terminated_length": 7082.46435546875,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "entropy": 1.0119014978408813,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002105508930981159,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 186071324.0,
+      "reward": 0.328125,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999904990196228,
+      "sampling/importance_sampling_ratio/min": 0.003494206117466092,
+      "sampling/sampling_logp_difference/max": 5.656649112701416,
+      "sampling/sampling_logp_difference/mean": 0.020860780030488968,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 1.0561529961705673e-05,
+      "clip_ratio/high_mean": 3.4390433256703545e-06,
+      "clip_ratio/low_mean": 2.8499469067355676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193851205196552e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16176.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 7463.2421875,
+      "completions/mean_terminated_length": 7463.2421875,
+      "completions/min_length": 698.0,
+      "completions/min_terminated_length": 698.0,
+      "entropy": 0.9983502700924873,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013582308311015368,
+      "learning_rate": 1e-05,
+      "loss": 0.048,
+      "num_tokens": 187045035.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2517249584197998,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999428987503052,
+      "sampling/importance_sampling_ratio/min": 0.000473080639494583,
+      "sampling/sampling_logp_difference/max": 7.65624475479126,
+      "sampling/sampling_logp_difference/mean": 0.021131811663508415,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 8.509013468938065e-06,
+      "clip_ratio/high_mean": 2.127253367234516e-06,
+      "clip_ratio/low_mean": 3.985050443588989e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.197775751890731e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14938.0,
+      "completions/mean_length": 6460.984375,
+      "completions/mean_terminated_length": 6382.8505859375,
+      "completions/min_length": 1747.0,
+      "completions/min_terminated_length": 1747.0,
+      "entropy": 0.7869217246770859,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002681629965081811,
+      "learning_rate": 1e-05,
+      "loss": 0.0987,
+      "num_tokens": 187889609.0,
+      "reward": 0.5234375,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999568462371826,
+      "sampling/importance_sampling_ratio/min": 0.0015037209959700704,
+      "sampling/sampling_logp_difference/max": 6.499812602996826,
+      "sampling/sampling_logp_difference/mean": 0.016937749460339546,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 1.2362176221358823e-05,
+      "clip_ratio/high_mean": 3.0905440553397057e-06,
+      "clip_ratio/low_mean": 5.0333514764133724e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.342405825103924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15893.0,
+      "completions/mean_length": 6241.78125,
+      "completions/mean_terminated_length": 6161.92138671875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.0217387825250626,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021239183843135834,
+      "learning_rate": 1e-05,
+      "loss": 0.0353,
+      "num_tokens": 188706605.0,
+      "reward": 0.2578125,
+      "reward_std": 0.3135277330875397,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999796748161316,
+      "sampling/importance_sampling_ratio/min": 0.004853047896176577,
+      "sampling/sampling_logp_difference/max": 5.328148365020752,
+      "sampling/sampling_logp_difference/mean": 0.02103862166404724,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 6.725130333506968e-06,
+      "clip_ratio/high_mean": 1.681282583376742e-06,
+      "clip_ratio/low_mean": 3.437372129155847e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.605500387493521e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15332.0,
+      "completions/mean_length": 5638.1328125,
+      "completions/mean_terminated_length": 5553.51953125,
+      "completions/min_length": 66.0,
+      "completions/min_terminated_length": 66.0,
+      "entropy": 0.7844365313649178,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023868419229984283,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 189446294.0,
+      "reward": 0.515625,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000369548797607,
+      "sampling/importance_sampling_ratio/min": 0.0008047468145377934,
+      "sampling/sampling_logp_difference/max": 7.124982833862305,
+      "sampling/sampling_logp_difference/mean": 0.017401430755853653,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 2.887730215661577e-05,
+      "clip_ratio/high_mean": 7.219325539153942e-06,
+      "clip_ratio/low_mean": 2.826443028425274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.548375502759882e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16196.0,
+      "completions/mean_length": 6374.8046875,
+      "completions/mean_terminated_length": 6215.9287109375,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9472770467400551,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027549315709620714,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 190281461.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3167053163051605,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998682737350464,
+      "sampling/importance_sampling_ratio/min": 7.100860239006579e-05,
+      "sampling/sampling_logp_difference/max": 9.552709579467773,
+      "sampling/sampling_logp_difference/mean": 0.020243138074874878,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 1.586787766427733e-05,
+      "clip_ratio/high_mean": 3.9669694160693325e-06,
+      "clip_ratio/low_mean": 2.978218674343225e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.374915604581474e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15097.0,
+      "completions/mean_length": 6654.21875,
+      "completions/mean_terminated_length": 6499.88134765625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "entropy": 1.0028243213891983,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0013344973558560014,
+      "learning_rate": 1e-05,
+      "loss": 0.0184,
+      "num_tokens": 191156249.0,
+      "reward": 0.359375,
+      "reward_std": 0.22832971811294556,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 0.0021875568199902773,
+      "sampling/sampling_logp_difference/max": 6.124969959259033,
+      "sampling/sampling_logp_difference/mean": 0.020470600575208664,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 1.681529829511419e-05,
+      "clip_ratio/high_mean": 4.9954849146160996e-06,
+      "clip_ratio/low_mean": 2.040554932136729e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5401033553862362e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16172.0,
+      "completions/mean_length": 6767.7890625,
+      "completions/mean_terminated_length": 6537.00048828125,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "entropy": 0.9059296399354935,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016136945923790336,
+      "learning_rate": 1e-05,
+      "loss": 0.0816,
+      "num_tokens": 192040526.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999668598175049,
+      "sampling/importance_sampling_ratio/min": 1.2452921509975567e-05,
+      "sampling/sampling_logp_difference/max": 11.29355525970459,
+      "sampling/sampling_logp_difference/mean": 0.020058143883943558,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9821966563758906e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9821966563758906e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16275.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 6767.4921875,
+      "completions/mean_terminated_length": 6767.4921875,
+      "completions/min_length": 998.0,
+      "completions/min_terminated_length": 998.0,
+      "entropy": 1.0446822568774223,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002869367366656661,
+      "learning_rate": 1e-05,
+      "loss": 0.0212,
+      "num_tokens": 192926469.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2517249882221222,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586343765259,
+      "sampling/importance_sampling_ratio/min": 1.9328599591972306e-05,
+      "sampling/sampling_logp_difference/max": 10.853924751281738,
+      "sampling/sampling_logp_difference/mean": 0.021512050181627274,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 3.44581130775623e-05,
+      "clip_ratio/high_mean": 1.3001711295146379e-05,
+      "clip_ratio/low_mean": 3.6407937841431703e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.940964981869911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16261.0,
+      "completions/max_terminated_length": 16261.0,
+      "completions/mean_length": 5738.484375,
+      "completions/mean_terminated_length": 5738.484375,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "entropy": 0.8617956340312958,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002177527640014887,
+      "learning_rate": 1e-05,
+      "loss": -0.0189,
+      "num_tokens": 193678859.0,
+      "reward": 0.5546875,
+      "reward_std": 0.33220988512039185,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570846557617,
+      "sampling/importance_sampling_ratio/min": 0.0008533780346624553,
+      "sampling/sampling_logp_difference/max": 7.06630802154541,
+      "sampling/sampling_logp_difference/mean": 0.018141131848096848,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 3.861003733618418e-06,
+      "clip_ratio/high_mean": 9.652509334046044e-07,
+      "clip_ratio/low_mean": 2.7767115511778684e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8732366558870126e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15595.0,
+      "completions/mean_length": 6382.90625,
+      "completions/mean_terminated_length": 5976.357421875,
+      "completions/min_length": 464.0,
+      "completions/min_terminated_length": 464.0,
+      "entropy": 0.8692388981580734,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004127771593630314,
+      "learning_rate": 1e-05,
+      "loss": 0.0572,
+      "num_tokens": 194511847.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2767002582550049,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998810291290283,
+      "sampling/importance_sampling_ratio/min": 5.4239239943854045e-06,
+      "sampling/sampling_logp_difference/max": 12.124691009521484,
+      "sampling/sampling_logp_difference/mean": 0.018376430496573448,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 9.728395525598899e-06,
+      "clip_ratio/high_mean": 2.4320988813997246e-06,
+      "clip_ratio/low_mean": 5.3631663831765763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.606376271316549e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14504.0,
+      "completions/max_terminated_length": 14504.0,
+      "completions/mean_length": 5776.15625,
+      "completions/mean_terminated_length": 5776.15625,
+      "completions/min_length": 1018.0,
+      "completions/min_terminated_length": 1018.0,
+      "entropy": 1.1195004731416702,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00263008801266551,
+      "learning_rate": 1e-05,
+      "loss": 0.0687,
+      "num_tokens": 195270051.0,
+      "reward": 0.421875,
+      "reward_std": 0.3618982434272766,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999971866607666,
+      "sampling/importance_sampling_ratio/min": 0.005209421273320913,
+      "sampling/sampling_logp_difference/max": 5.257286548614502,
+      "sampling/sampling_logp_difference/mean": 0.019923292100429535,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.2701100786216557e-05,
+      "clip_ratio/high_mean": 3.1752751965541393e-06,
+      "clip_ratio/low_mean": 4.2162768181697174e-05,
+      "clip_ratio/low_min": 3.873926743835909e-06,
+      "clip_ratio/region_mean": 4.5338043378251314e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 7411.421875,
+      "completions/mean_terminated_length": 7196.08056640625,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "entropy": 0.9801053553819656,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002642859937623143,
+      "learning_rate": 1e-05,
+      "loss": 0.07,
+      "num_tokens": 196240913.0,
+      "reward": 0.390625,
+      "reward_std": 0.27328529953956604,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999198913574219,
+      "sampling/importance_sampling_ratio/min": 0.00017500204558018595,
+      "sampling/sampling_logp_difference/max": 8.650712966918945,
+      "sampling/sampling_logp_difference/mean": 0.021511007100343704,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 1.5122936929401476e-05,
+      "clip_ratio/high_mean": 3.780734232350369e-06,
+      "clip_ratio/low_mean": 6.367217611114029e-05,
+      "clip_ratio/low_min": 4.8010447244450916e-06,
+      "clip_ratio/region_mean": 6.745291057086433e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16127.0,
+      "completions/mean_length": 7944.65625,
+      "completions/mean_terminated_length": 7742.1123046875,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "entropy": 1.0132562816143036,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002439325675368309,
+      "learning_rate": 1e-05,
+      "loss": 0.0564,
+      "num_tokens": 197278517.0,
+      "reward": 0.34375,
+      "reward_std": 0.3161812424659729,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999248385429382,
+      "sampling/importance_sampling_ratio/min": 1.0140610356756952e-05,
+      "sampling/sampling_logp_difference/max": 11.49896240234375,
+      "sampling/sampling_logp_difference/mean": 0.02124868705868721,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 2.6017536356448545e-05,
+      "clip_ratio/high_mean": 6.504384089112136e-06,
+      "clip_ratio/low_mean": 3.7791321346958284e-05,
+      "clip_ratio/low_min": 3.2110563097376144e-06,
+      "clip_ratio/region_mean": 4.429570503816649e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 7550.0,
+      "completions/mean_terminated_length": 7409.7783203125,
+      "completions/min_length": 1469.0,
+      "completions/min_terminated_length": 1469.0,
+      "entropy": 1.0384011715650558,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014879995724186301,
+      "learning_rate": 1e-05,
+      "loss": 0.0338,
+      "num_tokens": 198265589.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24040167033672333,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999468922615051,
+      "sampling/importance_sampling_ratio/min": 8.418659126618877e-05,
+      "sampling/sampling_logp_difference/max": 9.382474899291992,
+      "sampling/sampling_logp_difference/mean": 0.021503347903490067,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.3615457191917812e-05,
+      "clip_ratio/high_mean": 4.491880531531933e-06,
+      "clip_ratio/low_mean": 3.916533574965797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.365721684962409e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16221.0,
+      "completions/mean_length": 8140.9140625,
+      "completions/mean_terminated_length": 7517.48779296875,
+      "completions/min_length": 837.0,
+      "completions/min_terminated_length": 837.0,
+      "entropy": 0.8718572407960892,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002340668346732855,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 199324938.0,
+      "reward": 0.453125,
+      "reward_std": 0.35824596881866455,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999454021453857,
+      "sampling/importance_sampling_ratio/min": 0.002325017238035798,
+      "sampling/sampling_logp_difference/max": 6.064027786254883,
+      "sampling/sampling_logp_difference/mean": 0.019466478377580643,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 2.2175697040438536e-05,
+      "clip_ratio/high_mean": 5.543924260109634e-06,
+      "clip_ratio/low_mean": 4.1318608055007644e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.686253225827386e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16263.0,
+      "completions/mean_length": 6630.96875,
+      "completions/mean_terminated_length": 6396.896484375,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "entropy": 0.7798146530985832,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001989356242120266,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 200189902.0,
+      "reward": 0.5625,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474883079529,
+      "sampling/importance_sampling_ratio/min": 0.0003315774374641478,
+      "sampling/sampling_logp_difference/max": 8.011649131774902,
+      "sampling/sampling_logp_difference/mean": 0.01849902793765068,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 3.325706302348408e-06,
+      "clip_ratio/high_mean": 8.31426575587102e-07,
+      "clip_ratio/low_mean": 2.0285911205064622e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.111733795118198e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15357.0,
+      "completions/max_terminated_length": 15357.0,
+      "completions/mean_length": 6582.203125,
+      "completions/mean_terminated_length": 6582.203125,
+      "completions/min_length": 593.0,
+      "completions/min_terminated_length": 593.0,
+      "entropy": 1.0181676000356674,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002594445599243045,
+      "learning_rate": 1e-05,
+      "loss": 0.0232,
+      "num_tokens": 201052832.0,
+      "reward": 0.34375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999495148658752,
+      "sampling/importance_sampling_ratio/min": 0.0003853558446280658,
+      "sampling/sampling_logp_difference/max": 7.8613433837890625,
+      "sampling/sampling_logp_difference/mean": 0.021598614752292633,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 2.2044430352252675e-05,
+      "clip_ratio/high_mean": 5.511107588063169e-06,
+      "clip_ratio/low_mean": 3.4155824209847196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96669319115972e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14540.0,
+      "completions/max_terminated_length": 14540.0,
+      "completions/mean_length": 6145.1796875,
+      "completions/mean_terminated_length": 6145.1796875,
+      "completions/min_length": 1098.0,
+      "completions/min_terminated_length": 1098.0,
+      "entropy": 0.9084350541234016,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003104996867477894,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 201858047.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33220985531806946,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011682510376,
+      "sampling/importance_sampling_ratio/min": 0.007650630082935095,
+      "sampling/sampling_logp_difference/max": 4.87296724319458,
+      "sampling/sampling_logp_difference/mean": 0.018979094922542572,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 2.9959978519400465e-05,
+      "clip_ratio/high_mean": 7.489994629850116e-06,
+      "clip_ratio/low_mean": 3.5255963325653283e-05,
+      "clip_ratio/low_min": 2.973075879708631e-06,
+      "clip_ratio/region_mean": 4.274595892184152e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15745.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 7259.953125,
+      "completions/mean_terminated_length": 7259.953125,
+      "completions/min_length": 960.0,
+      "completions/min_terminated_length": 960.0,
+      "entropy": 0.9823614731431007,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003212577663362026,
+      "learning_rate": 1e-05,
+      "loss": 0.0133,
+      "num_tokens": 202807673.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999860405921936,
+      "sampling/importance_sampling_ratio/min": 0.000536504783667624,
+      "sampling/sampling_logp_difference/max": 7.530435085296631,
+      "sampling/sampling_logp_difference/mean": 0.021432969719171524,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 3.273996276220714e-05,
+      "clip_ratio/high_mean": 9.095591565255745e-06,
+      "clip_ratio/low_mean": 2.9539680099333054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8635271948805894e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16369.0,
+      "completions/mean_length": 7258.71875,
+      "completions/mean_terminated_length": 7113.87353515625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8823810070753098,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001418307889252901,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 203757333.0,
+      "reward": 0.40625,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884963035583,
+      "sampling/importance_sampling_ratio/min": 0.0006408974295482039,
+      "sampling/sampling_logp_difference/max": 7.3526411056518555,
+      "sampling/sampling_logp_difference/mean": 0.019296500831842422,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 1.544119368190877e-05,
+      "clip_ratio/high_mean": 3.860298420477193e-06,
+      "clip_ratio/low_mean": 3.755458698151415e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.141488631148604e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7011.40625,
+      "completions/mean_terminated_length": 6386.56689453125,
+      "completions/min_length": 685.0,
+      "completions/min_terminated_length": 685.0,
+      "entropy": 0.8057166337966919,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001652427832596004,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 204675065.0,
+      "reward": 0.46875,
+      "reward_std": 0.24146251380443573,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918937683105,
+      "sampling/importance_sampling_ratio/min": 0.015319154597818851,
+      "sampling/sampling_logp_difference/max": 4.178651332855225,
+      "sampling/sampling_logp_difference/mean": 0.018787402659654617,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 5.222041181696113e-06,
+      "clip_ratio/high_mean": 2.209917965956265e-06,
+      "clip_ratio/low_mean": 4.0701652551433654e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.291157006264257e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14796.0,
+      "completions/max_terminated_length": 14796.0,
+      "completions/mean_length": 6243.4296875,
+      "completions/mean_terminated_length": 6243.4296875,
+      "completions/min_length": 1023.0,
+      "completions/min_terminated_length": 1023.0,
+      "entropy": 0.9856048971414566,
+      "epoch": 0.23643054277828887,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001482579973526299,
+      "learning_rate": 1e-05,
+      "loss": 0.0677,
+      "num_tokens": 205494344.0,
+      "reward": 0.5390625,
+      "reward_std": 0.28930407762527466,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998942613601685,
+      "sampling/importance_sampling_ratio/min": 0.0004254466330166906,
+      "sampling/sampling_logp_difference/max": 7.762371063232422,
+      "sampling/sampling_logp_difference/mean": 0.019727632403373718,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 6.842733455414418e-05,
+      "clip_ratio/low_min": 9.297655878981459e-06,
+      "clip_ratio/region_mean": 6.842733455414418e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15485.0,
+      "completions/mean_length": 7122.2421875,
+      "completions/mean_terminated_length": 6586.4375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.8625433370471001,
+      "epoch": 0.23735050597976082,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002006452763453126,
+      "learning_rate": 1e-05,
+      "loss": 0.0312,
+      "num_tokens": 206428775.0,
+      "reward": 0.40625,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999338388442993,
+      "sampling/importance_sampling_ratio/min": 0.00010911409481195733,
+      "sampling/sampling_logp_difference/max": 9.123116493225098,
+      "sampling/sampling_logp_difference/mean": 0.01927522011101246,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 2.887607206503162e-05,
+      "clip_ratio/high_mean": 7.219018016257905e-06,
+      "clip_ratio/low_mean": 2.7790995090981596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.501001378936053e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15254.0,
+      "completions/mean_length": 7965.2734375,
+      "completions/mean_terminated_length": 7623.6826171875,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 1.0068430602550507,
+      "epoch": 0.23827046918123276,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0029176415409892797,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 207469586.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2212003916501999,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998945593833923,
+      "sampling/importance_sampling_ratio/min": 4.06005028708023e-06,
+      "sampling/sampling_logp_difference/max": 12.414315223693848,
+      "sampling/sampling_logp_difference/mean": 0.02198987640440464,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 8.710998599781306e-06,
+      "clip_ratio/high_mean": 2.1777496499453264e-06,
+      "clip_ratio/low_mean": 4.1899779091636447e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.407752874158177e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6329.4296875,
+      "completions/mean_terminated_length": 6169.83349609375,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "entropy": 0.9399363100528717,
+      "epoch": 0.23919043238270468,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0019115234026685357,
+      "learning_rate": 1e-05,
+      "loss": 0.0399,
+      "num_tokens": 208300217.0,
+      "reward": 0.4375,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000567436218262,
+      "sampling/importance_sampling_ratio/min": 2.1449603082146496e-05,
+      "sampling/sampling_logp_difference/max": 10.749804496765137,
+      "sampling/sampling_logp_difference/mean": 0.020002204924821854,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 2.536784450057894e-05,
+      "clip_ratio/high_mean": 6.341961125144735e-06,
+      "clip_ratio/low_mean": 5.959111433639919e-05,
+      "clip_ratio/low_min": 1.1521060741870315e-05,
+      "clip_ratio/region_mean": 6.593307591629127e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15635.0,
+      "completions/mean_length": 6747.90625,
+      "completions/mean_terminated_length": 6594.95263671875,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "entropy": 0.9575144425034523,
+      "epoch": 0.24011039558417663,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003766207257285714,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 209181077.0,
+      "reward": 0.4375,
+      "reward_std": 0.3164137303829193,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999313354492188,
+      "sampling/importance_sampling_ratio/min": 1.250743298442103e-05,
+      "sampling/sampling_logp_difference/max": 11.28918743133545,
+      "sampling/sampling_logp_difference/mean": 0.020067427307367325,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 2.0626074274332495e-05,
+      "clip_ratio/high_mean": 5.156518568583124e-06,
+      "clip_ratio/low_mean": 5.808068385704246e-05,
+      "clip_ratio/low_min": 1.0360539818066172e-05,
+      "clip_ratio/region_mean": 6.32372018571914e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 6426.6953125,
+      "completions/mean_terminated_length": 6348.29150390625,
+      "completions/min_length": 767.0,
+      "completions/min_terminated_length": 767.0,
+      "entropy": 0.87480478733778,
+      "epoch": 0.24103035878564857,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002375675830990076,
+      "learning_rate": 1e-05,
+      "loss": 0.0752,
+      "num_tokens": 210023702.0,
+      "reward": 0.5078125,
+      "reward_std": 0.38900789618492126,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999383687973022,
+      "sampling/importance_sampling_ratio/min": 0.00024259372730739415,
+      "sampling/sampling_logp_difference/max": 8.324122428894043,
+      "sampling/sampling_logp_difference/mean": 0.018864646553993225,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 4.462851393327583e-06,
+      "clip_ratio/high_mean": 1.1157128483318957e-06,
+      "clip_ratio/low_mean": 3.8966268334661436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.008198141036701e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16022.0,
+      "completions/mean_length": 7223.1484375,
+      "completions/mean_terminated_length": 6927.63671875,
+      "completions/min_length": 1015.0,
+      "completions/min_terminated_length": 1015.0,
+      "entropy": 1.0218688547611237,
+      "epoch": 0.24195032198712052,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016257674433290958,
+      "learning_rate": 1e-05,
+      "loss": 0.0791,
+      "num_tokens": 210969921.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2896084189414978,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999514818191528,
+      "sampling/importance_sampling_ratio/min": 9.193710138788447e-05,
+      "sampling/sampling_logp_difference/max": 9.294405937194824,
+      "sampling/sampling_logp_difference/mean": 0.02119653858244419,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 1.2653464409595472e-05,
+      "clip_ratio/high_mean": 3.163366102398868e-06,
+      "clip_ratio/low_mean": 4.864477250521304e-05,
+      "clip_ratio/low_min": 8.641252861707471e-06,
+      "clip_ratio/region_mean": 5.1808138323394815e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15180.0,
+      "completions/max_terminated_length": 15180.0,
+      "completions/mean_length": 6974.0703125,
+      "completions/mean_terminated_length": 6974.0703125,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9265539348125458,
+      "epoch": 0.24287028518859247,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023448490537703037,
+      "learning_rate": 1e-05,
+      "loss": 0.0567,
+      "num_tokens": 211884866.0,
+      "reward": 0.390625,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000474452972412,
+      "sampling/importance_sampling_ratio/min": 0.0007677432149648666,
+      "sampling/sampling_logp_difference/max": 7.172055244445801,
+      "sampling/sampling_logp_difference/mean": 0.020384611561894417,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 1.1967917316724197e-05,
+      "clip_ratio/high_mean": 2.9919793291810493e-06,
+      "clip_ratio/low_mean": 3.179497366545547e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.478695157355105e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15657.0,
+      "completions/mean_length": 7247.2734375,
+      "completions/mean_terminated_length": 7027.9921875,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.9756898358464241,
+      "epoch": 0.24379024839006438,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003212807234376669,
+      "learning_rate": 1e-05,
+      "loss": 0.0484,
+      "num_tokens": 212833933.0,
+      "reward": 0.328125,
+      "reward_std": 0.2398776412010193,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999449253082275,
+      "sampling/importance_sampling_ratio/min": 0.001600456889718771,
+      "sampling/sampling_logp_difference/max": 6.437466144561768,
+      "sampling/sampling_logp_difference/mean": 0.0199666079133749,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 1.1404694760130951e-05,
+      "clip_ratio/high_mean": 3.887520392709121e-06,
+      "clip_ratio/low_mean": 4.0242122167910566e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4129643583801226e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15575.0,
+      "completions/mean_length": 7773.9296875,
+      "completions/mean_terminated_length": 7423.9267578125,
+      "completions/min_length": 568.0,
+      "completions/min_terminated_length": 568.0,
+      "entropy": 0.9765531942248344,
+      "epoch": 0.24471021159153633,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019600428640842438,
+      "learning_rate": 1e-05,
+      "loss": 0.0357,
+      "num_tokens": 213848508.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3129909336566925,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 2.430168751743622e-05,
+      "sampling/sampling_logp_difference/max": 10.624964714050293,
+      "sampling/sampling_logp_difference/mean": 0.020565161481499672,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 6.725708999510971e-06,
+      "clip_ratio/high_mean": 1.6814272498777427e-06,
+      "clip_ratio/low_mean": 2.869901106805628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0380438261090603e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15583.0,
+      "completions/mean_length": 6722.5,
+      "completions/mean_terminated_length": 6569.14306640625,
+      "completions/min_length": 1021.0,
+      "completions/min_terminated_length": 1021.0,
+      "entropy": 0.9291529878973961,
+      "epoch": 0.24563017479300828,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014550165506079793,
+      "learning_rate": 1e-05,
+      "loss": 0.0235,
+      "num_tokens": 214731180.0,
+      "reward": 0.4921875,
+      "reward_std": 0.19332444667816162,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999178647994995,
+      "sampling/importance_sampling_ratio/min": 0.007400285452604294,
+      "sampling/sampling_logp_difference/max": 4.90623664855957,
+      "sampling/sampling_logp_difference/mean": 0.020057080313563347,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 1.8797170469042612e-05,
+      "clip_ratio/high_mean": 6.827749643889547e-06,
+      "clip_ratio/low_mean": 3.448591337473772e-05,
+      "clip_ratio/low_min": 4.687090040533803e-06,
+      "clip_ratio/region_mean": 4.1313662677566754e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15797.0,
+      "completions/max_terminated_length": 15797.0,
+      "completions/mean_length": 7001.8671875,
+      "completions/mean_terminated_length": 7001.8671875,
+      "completions/min_length": 930.0,
+      "completions/min_terminated_length": 930.0,
+      "entropy": 1.0746883526444435,
+      "epoch": 0.24655013799448022,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002483292715623975,
+      "learning_rate": 1e-05,
+      "loss": 0.048,
+      "num_tokens": 215645819.0,
+      "reward": 0.3515625,
+      "reward_std": 0.32955142855644226,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 1.0195622053288389e-05,
+      "sampling/sampling_logp_difference/max": 11.493552207946777,
+      "sampling/sampling_logp_difference/mean": 0.020808640867471695,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 8.748068921704544e-06,
+      "clip_ratio/high_mean": 2.187017230426136e-06,
+      "clip_ratio/low_mean": 8.762007928453386e-05,
+      "clip_ratio/low_min": 2.3698836685071e-05,
+      "clip_ratio/region_mean": 8.980709480965743e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14189.0,
+      "completions/mean_length": 6663.796875,
+      "completions/mean_terminated_length": 6509.50830078125,
+      "completions/min_length": 1148.0,
+      "completions/min_terminated_length": 1148.0,
+      "entropy": 1.0000900849699974,
+      "epoch": 0.24747010119595217,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0015696679474785924,
+      "learning_rate": 1e-05,
+      "loss": 0.0731,
+      "num_tokens": 216519369.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3214311897754669,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997707605361938,
+      "sampling/importance_sampling_ratio/min": 1.288027192458685e-06,
+      "sampling/sampling_logp_difference/max": 13.562398910522461,
+      "sampling/sampling_logp_difference/mean": 0.022182684391736984,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.567897690321843e-05,
+      "clip_ratio/low_min": 3.287224444648018e-06,
+      "clip_ratio/region_mean": 4.567897690321843e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16381.0,
+      "completions/mean_length": 6978.7421875,
+      "completions/mean_terminated_length": 6829.45263671875,
+      "completions/min_length": 1661.0,
+      "completions/min_terminated_length": 1661.0,
+      "entropy": 1.0845019966363907,
+      "epoch": 0.24839006439742412,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003887100610882044,
+      "learning_rate": 1e-05,
+      "loss": 0.1076,
+      "num_tokens": 217432432.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3124619722366333,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999902248382568,
+      "sampling/importance_sampling_ratio/min": 0.02168075367808342,
+      "sampling/sampling_logp_difference/max": 3.8313302993774414,
+      "sampling/sampling_logp_difference/mean": 0.02127157337963581,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 2.444328310957644e-05,
+      "clip_ratio/high_mean": 6.11082077739411e-06,
+      "clip_ratio/low_mean": 5.1527222922231886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.7638043699625996e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15638.0,
+      "completions/mean_length": 5903.5546875,
+      "completions/mean_terminated_length": 5652.0244140625,
+      "completions/min_length": 651.0,
+      "completions/min_terminated_length": 651.0,
+      "entropy": 0.8638224303722382,
+      "epoch": 0.24931002759889603,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002851828932762146,
+      "learning_rate": 1e-05,
+      "loss": 0.0771,
+      "num_tokens": 218208399.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3713914752006531,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000553131103516,
+      "sampling/importance_sampling_ratio/min": 0.000626727007329464,
+      "sampling/sampling_logp_difference/max": 7.374999523162842,
+      "sampling/sampling_logp_difference/mean": 0.01880766451358795,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 8.474872856822913e-06,
+      "clip_ratio/high_mean": 2.118718214205728e-06,
+      "clip_ratio/low_mean": 2.5821682072546537e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.794040096887329e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16060.0,
+      "completions/max_terminated_length": 16060.0,
+      "completions/mean_length": 5596.7109375,
+      "completions/mean_terminated_length": 5596.7109375,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "entropy": 1.1127397641539574,
+      "epoch": 0.250229990800368,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018005800666287541,
+      "learning_rate": 1e-05,
+      "loss": 0.0075,
+      "num_tokens": 218944418.0,
+      "reward": 0.4375,
+      "reward_std": 0.29485049843788147,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000048875808716,
+      "sampling/importance_sampling_ratio/min": 0.01548748929053545,
+      "sampling/sampling_logp_difference/max": 4.167722702026367,
+      "sampling/sampling_logp_difference/mean": 0.02004322223365307,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 1.5034628631838132e-05,
+      "clip_ratio/high_mean": 4.925485768580984e-06,
+      "clip_ratio/low_mean": 3.539464648838475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.032013237065257e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16362.0,
+      "completions/mean_length": 7120.109375,
+      "completions/mean_terminated_length": 7047.16552734375,
+      "completions/min_length": 816.0,
+      "completions/min_terminated_length": 816.0,
+      "entropy": 1.0697019025683403,
+      "epoch": 0.2511499540018399,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022711476776748896,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 219875952.0,
+      "reward": 0.2734375,
+      "reward_std": 0.23751862347126007,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000036358833313,
+      "sampling/importance_sampling_ratio/min": 9.733050683280453e-05,
+      "sampling/sampling_logp_difference/max": 9.237398147583008,
+      "sampling/sampling_logp_difference/mean": 0.02110595628619194,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 1.0558468147792155e-05,
+      "clip_ratio/high_mean": 2.6396170369480387e-06,
+      "clip_ratio/low_mean": 3.796903268948881e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.060864915800266e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 7623.953125,
+      "completions/mean_terminated_length": 7484.9052734375,
+      "completions/min_length": 372.0,
+      "completions/min_terminated_length": 372.0,
+      "entropy": 0.8836525157094002,
+      "epoch": 0.25206991720331184,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002609838731586933,
+      "learning_rate": 1e-05,
+      "loss": 0.0563,
+      "num_tokens": 220871730.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999877214431763,
+      "sampling/importance_sampling_ratio/min": 0.0015448236372321844,
+      "sampling/sampling_logp_difference/max": 6.472845554351807,
+      "sampling/sampling_logp_difference/mean": 0.019322458654642105,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 1.144785210271948e-05,
+      "clip_ratio/high_mean": 2.86196302567987e-06,
+      "clip_ratio/low_mean": 5.795533934360719e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 6.081730361984228e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15557.0,
+      "completions/mean_length": 6778.71875,
+      "completions/mean_terminated_length": 6703.08642578125,
+      "completions/min_length": 1187.0,
+      "completions/min_terminated_length": 1187.0,
+      "entropy": 0.8968989998102188,
+      "epoch": 0.2529898804047838,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.00395589042454958,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 221761214.0,
+      "reward": 0.4921875,
+      "reward_std": 0.4032142758369446,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000214576721191,
+      "sampling/importance_sampling_ratio/min": 0.0011724763317033648,
+      "sampling/sampling_logp_difference/max": 6.7486371994018555,
+      "sampling/sampling_logp_difference/mean": 0.018937086686491966,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 2.708495139813749e-05,
+      "clip_ratio/high_mean": 7.628764933542698e-06,
+      "clip_ratio/low_mean": 3.0297362627607072e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.792612744746293e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 7319.2578125,
+      "completions/mean_terminated_length": 6794.85107421875,
+      "completions/min_length": 1034.0,
+      "completions/min_terminated_length": 1034.0,
+      "entropy": 0.870811752974987,
+      "epoch": 0.25390984360625574,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002063714899122715,
+      "learning_rate": 1e-05,
+      "loss": 0.0271,
+      "num_tokens": 222719287.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2835301160812378,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999525547027588,
+      "sampling/importance_sampling_ratio/min": 2.13631665246794e-05,
+      "sampling/sampling_logp_difference/max": 10.7538423538208,
+      "sampling/sampling_logp_difference/mean": 0.019336167722940445,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 3.860288416035473e-06,
+      "clip_ratio/high_mean": 9.650721040088683e-07,
+      "clip_ratio/low_mean": 2.303871349340625e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4003785597415117e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16326.0,
+      "completions/mean_length": 6207.4140625,
+      "completions/mean_terminated_length": 5879.13671875,
+      "completions/min_length": 752.0,
+      "completions/min_terminated_length": 752.0,
+      "entropy": 0.8348869979381561,
+      "epoch": 0.2548298068077277,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023463829420506954,
+      "learning_rate": 1e-05,
+      "loss": 0.0696,
+      "num_tokens": 223533372.0,
+      "reward": 0.4375,
+      "reward_std": 0.2359210103750229,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000433921813965,
+      "sampling/importance_sampling_ratio/min": 2.1447433027788065e-05,
+      "sampling/sampling_logp_difference/max": 10.749905586242676,
+      "sampling/sampling_logp_difference/mean": 0.018392907455563545,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 2.1441665467136772e-05,
+      "clip_ratio/high_mean": 5.360416366784193e-06,
+      "clip_ratio/low_mean": 5.504566888703266e-05,
+      "clip_ratio/low_min": 1.2581466762640048e-05,
+      "clip_ratio/region_mean": 6.040608514013002e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14713.0,
+      "completions/max_terminated_length": 14713.0,
+      "completions/mean_length": 6417.2109375,
+      "completions/mean_terminated_length": 6417.2109375,
+      "completions/min_length": 981.0,
+      "completions/min_terminated_length": 981.0,
+      "entropy": 1.0232173576951027,
+      "epoch": 0.25574977000919963,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033652919810265303,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 224375711.0,
+      "reward": 0.390625,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999820590019226,
+      "sampling/importance_sampling_ratio/min": 0.0020559614058583975,
+      "sampling/sampling_logp_difference/max": 6.18701171875,
+      "sampling/sampling_logp_difference/mean": 0.020980924367904663,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 4.679544872487895e-06,
+      "clip_ratio/high_mean": 1.1698862181219738e-06,
+      "clip_ratio/low_mean": 2.818696702888701e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9356853247008985e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15026.0,
+      "completions/max_terminated_length": 15026.0,
+      "completions/mean_length": 5275.9453125,
+      "completions/mean_terminated_length": 5275.9453125,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "entropy": 0.8563915193080902,
+      "epoch": 0.25666973321067155,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025473968125879765,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 225070992.0,
+      "reward": 0.703125,
+      "reward_std": 0.2790592610836029,
+      "rewards/accuracy_reward/mean": 0.703125,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999873042106628,
+      "sampling/importance_sampling_ratio/min": 0.0010016229934990406,
+      "sampling/sampling_logp_difference/max": 6.906133651733398,
+      "sampling/sampling_logp_difference/mean": 0.018068701028823853,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.1973035422415705e-05,
+      "clip_ratio/low_min": 6.267234766710317e-06,
+      "clip_ratio/region_mean": 4.1973035422415705e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16350.0,
+      "completions/mean_length": 7693.984375,
+      "completions/mean_terminated_length": 7556.0478515625,
+      "completions/min_length": 1349.0,
+      "completions/min_terminated_length": 1349.0,
+      "entropy": 0.7832933664321899,
+      "epoch": 0.2575896964121435,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016663498245179653,
+      "learning_rate": 1e-05,
+      "loss": 0.0836,
+      "num_tokens": 226073822.0,
+      "reward": 0.421875,
+      "reward_std": 0.3227166533470154,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999273419380188,
+      "sampling/importance_sampling_ratio/min": 5.893720299354754e-06,
+      "sampling/sampling_logp_difference/max": 12.04162311553955,
+      "sampling/sampling_logp_difference/mean": 0.01851016655564308,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 1.304801662627142e-05,
+      "clip_ratio/high_mean": 3.262004156567855e-06,
+      "clip_ratio/low_mean": 3.7096169648975774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.035817426029098e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15207.0,
+      "completions/mean_length": 6222.4609375,
+      "completions/mean_terminated_length": 6061.1669921875,
+      "completions/min_length": 967.0,
+      "completions/min_terminated_length": 967.0,
+      "entropy": 0.8835120126605034,
+      "epoch": 0.25850965961361544,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021045261528342962,
+      "learning_rate": 1e-05,
+      "loss": 0.055,
+      "num_tokens": 226888577.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999616742134094,
+      "sampling/importance_sampling_ratio/min": 5.688065698450373e-07,
+      "sampling/sampling_logp_difference/max": 14.379725456237793,
+      "sampling/sampling_logp_difference/mean": 0.018851105123758316,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.1754828114571865e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1754828114571865e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16331.0,
+      "completions/mean_length": 6371.625,
+      "completions/mean_terminated_length": 6131.328125,
+      "completions/min_length": 1034.0,
+      "completions/min_terminated_length": 1034.0,
+      "entropy": 0.9026313945651054,
+      "epoch": 0.2594296228150874,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030448357574641705,
+      "learning_rate": 1e-05,
+      "loss": 0.1009,
+      "num_tokens": 227722025.0,
+      "reward": 0.515625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999712705612183,
+      "sampling/importance_sampling_ratio/min": 0.00016869648243300617,
+      "sampling/sampling_logp_difference/max": 8.687409400939941,
+      "sampling/sampling_logp_difference/mean": 0.018757576122879982,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 7.024085562079563e-06,
+      "clip_ratio/high_mean": 1.7560213905198907e-06,
+      "clip_ratio/low_mean": 3.379111592494155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5547137599678535e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15891.0,
+      "completions/mean_length": 7510.4921875,
+      "completions/mean_terminated_length": 7224.25,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "entropy": 1.044313833117485,
+      "epoch": 0.26034958601655933,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019295766251161695,
+      "learning_rate": 1e-05,
+      "loss": 0.0513,
+      "num_tokens": 228703256.0,
+      "reward": 0.3046875,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999972581863403,
+      "sampling/importance_sampling_ratio/min": 0.0002186153142247349,
+      "sampling/sampling_logp_difference/max": 8.428196907043457,
+      "sampling/sampling_logp_difference/mean": 0.02207346074283123,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 5.068321115686558e-06,
+      "clip_ratio/high_mean": 1.2670802789216395e-06,
+      "clip_ratio/low_mean": 3.7797102550030104e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9064182828951743e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 7594.140625,
+      "completions/mean_terminated_length": 7524.92919921875,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9706612005829811,
+      "epoch": 0.2612695492180313,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0017117204843088984,
+      "learning_rate": 1e-05,
+      "loss": 0.0748,
+      "num_tokens": 229697002.0,
+      "reward": 0.2734375,
+      "reward_std": 0.18649455904960632,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000016212463379,
+      "sampling/importance_sampling_ratio/min": 0.00035400164779275656,
+      "sampling/sampling_logp_difference/max": 7.946208953857422,
+      "sampling/sampling_logp_difference/mean": 0.021097885444760323,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 1.5618601537426002e-05,
+      "clip_ratio/high_mean": 3.904650384356501e-06,
+      "clip_ratio/low_mean": 4.570582996166195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.961048034601845e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15562.0,
+      "completions/mean_length": 6888.9140625,
+      "completions/mean_terminated_length": 6738.19873046875,
+      "completions/min_length": 327.0,
+      "completions/min_terminated_length": 327.0,
+      "entropy": 0.9210037142038345,
+      "epoch": 0.2621895124195032,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025933689903467894,
+      "learning_rate": 1e-05,
+      "loss": 0.0887,
+      "num_tokens": 230598679.0,
+      "reward": 0.4375,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.0007308972999453545,
+      "sampling/sampling_logp_difference/max": 7.221237659454346,
+      "sampling/sampling_logp_difference/mean": 0.01939917542040348,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 2.398964193162101e-05,
+      "clip_ratio/high_mean": 6.9283565835576155e-06,
+      "clip_ratio/low_mean": 4.821338916372042e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.514174608833855e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15458.0,
+      "completions/mean_length": 6433.640625,
+      "completions/mean_terminated_length": 6355.29150390625,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "entropy": 1.064419962465763,
+      "epoch": 0.26310947562097514,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0019397985888645053,
+      "learning_rate": 1e-05,
+      "loss": 0.0841,
+      "num_tokens": 231440153.0,
+      "reward": 0.375,
+      "reward_std": 0.3451131582260132,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999503493309021,
+      "sampling/importance_sampling_ratio/min": 0.019039930775761604,
+      "sampling/sampling_logp_difference/max": 3.961216926574707,
+      "sampling/sampling_logp_difference/mean": 0.021084938198328018,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 1.9223051822336856e-05,
+      "clip_ratio/high_mean": 6.997284344834043e-06,
+      "clip_ratio/low_mean": 5.4512621773028513e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.150990611786256e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14304.0,
+      "completions/mean_length": 5858.40625,
+      "completions/mean_terminated_length": 5691.33349609375,
+      "completions/min_length": 546.0,
+      "completions/min_terminated_length": 546.0,
+      "entropy": 0.8120778575539589,
+      "epoch": 0.2640294388224471,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002288782736286521,
+      "learning_rate": 1e-05,
+      "loss": 0.0408,
+      "num_tokens": 232209485.0,
+      "reward": 0.46875,
+      "reward_std": 0.36637401580810547,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999239444732666,
+      "sampling/importance_sampling_ratio/min": 0.00017959839897230268,
+      "sampling/sampling_logp_difference/max": 8.624787330627441,
+      "sampling/sampling_logp_difference/mean": 0.019076552242040634,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 9.900939403451048e-06,
+      "clip_ratio/high_mean": 3.4680233511608094e-06,
+      "clip_ratio/low_mean": 1.8137742017643177e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1605765368803986e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 7088.4765625,
+      "completions/mean_terminated_length": 6710.609375,
+      "completions/min_length": 688.0,
+      "completions/min_terminated_length": 688.0,
+      "entropy": 0.9231890514492989,
+      "epoch": 0.26494940202391903,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.001075367210432887,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 233133850.0,
+      "reward": 0.5078125,
+      "reward_std": 0.18383610248565674,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998681545257568,
+      "sampling/importance_sampling_ratio/min": 0.005257915705442429,
+      "sampling/sampling_logp_difference/max": 5.248020648956299,
+      "sampling/sampling_logp_difference/mean": 0.019140273332595825,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 8.648456969240215e-06,
+      "clip_ratio/high_mean": 2.1621142423100537e-06,
+      "clip_ratio/low_mean": 1.838804723774956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0550161480059614e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16384.0,
+      "completions/mean_length": 6151.78125,
+      "completions/mean_terminated_length": 5906.20849609375,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.8585417941212654,
+      "epoch": 0.265869365225391,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0015517349820584059,
+      "learning_rate": 1e-05,
+      "loss": 0.0828,
+      "num_tokens": 233940718.0,
+      "reward": 0.46875,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000255107879639,
+      "sampling/importance_sampling_ratio/min": 7.617311348440126e-05,
+      "sampling/sampling_logp_difference/max": 9.482501983642578,
+      "sampling/sampling_logp_difference/mean": 0.019276250153779984,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 1.1416668485253467e-05,
+      "clip_ratio/high_mean": 3.7661499732166703e-06,
+      "clip_ratio/low_mean": 2.1342358195397537e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5108507770710276e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15133.0,
+      "completions/mean_length": 7111.2578125,
+      "completions/mean_terminated_length": 6812.13671875,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "entropy": 0.9735362678766251,
+      "epoch": 0.2667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0036829947493970394,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 234872111.0,
+      "reward": 0.4296875,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999943971633911,
+      "sampling/importance_sampling_ratio/min": 0.0006535807042382658,
+      "sampling/sampling_logp_difference/max": 7.333044528961182,
+      "sampling/sampling_logp_difference/mean": 0.021356046199798584,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 2.2526005068357335e-05,
+      "clip_ratio/high_mean": 5.631501267089334e-06,
+      "clip_ratio/low_mean": 3.30086276107977e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.864012808207917e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15995.0,
+      "completions/mean_length": 6787.671875,
+      "completions/mean_terminated_length": 6478.11279296875,
+      "completions/min_length": 1404.0,
+      "completions/min_terminated_length": 1404.0,
+      "entropy": 0.8856986835598946,
+      "epoch": 0.26770929162833484,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00234629912301898,
+      "learning_rate": 1e-05,
+      "loss": 0.0169,
+      "num_tokens": 235759149.0,
+      "reward": 0.5390625,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999296069145203,
+      "sampling/importance_sampling_ratio/min": 0.00035710117663256824,
+      "sampling/sampling_logp_difference/max": 7.937491416931152,
+      "sampling/sampling_logp_difference/mean": 0.01950475014746189,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 2.6025282068076194e-05,
+      "clip_ratio/high_mean": 6.5063205170190486e-06,
+      "clip_ratio/low_mean": 4.603358706845029e-05,
+      "clip_ratio/low_min": 4.53654638477019e-06,
+      "clip_ratio/region_mean": 5.253990843812062e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15845.0,
+      "completions/mean_length": 6757.203125,
+      "completions/mean_terminated_length": 6604.39697265625,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "entropy": 0.9217840805649757,
+      "epoch": 0.2686292548298068,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034125701058655977,
+      "learning_rate": 1e-05,
+      "loss": 0.0527,
+      "num_tokens": 236643319.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2896084189414978,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 6.144329745438881e-06,
+      "sampling/sampling_logp_difference/max": 11.999980926513672,
+      "sampling/sampling_logp_difference/mean": 0.020774487406015396,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.5210429246035346e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5210429246035346e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 6504.4375,
+      "completions/mean_terminated_length": 6185.74169921875,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "entropy": 1.126970261335373,
+      "epoch": 0.26954921803127874,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020905097480863333,
+      "learning_rate": 1e-05,
+      "loss": 0.0464,
+      "num_tokens": 237495351.0,
+      "reward": 0.25,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000053644180298,
+      "sampling/importance_sampling_ratio/min": 0.0009940610034391284,
+      "sampling/sampling_logp_difference/max": 6.913712024688721,
+      "sampling/sampling_logp_difference/mean": 0.023218728601932526,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.5693222053414502e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.5693222053414502e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15888.0,
+      "completions/mean_length": 5702.4140625,
+      "completions/mean_terminated_length": 5446.05615234375,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8803137242794037,
+      "epoch": 0.2704691812327507,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002073790645226836,
+      "learning_rate": 1e-05,
+      "loss": 0.0066,
+      "num_tokens": 238251852.0,
+      "reward": 0.5625,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000054955482483,
+      "sampling/importance_sampling_ratio/min": 0.016290459781885147,
+      "sampling/sampling_logp_difference/max": 4.117175579071045,
+      "sampling/sampling_logp_difference/mean": 0.0185186006128788,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 1.4213665508577833e-05,
+      "clip_ratio/high_mean": 4.4483959982244414e-06,
+      "clip_ratio/low_mean": 2.979715202400257e-05,
+      "clip_ratio/low_min": 4.1597336348786484e-06,
+      "clip_ratio/region_mean": 3.424554824960069e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 7176.2890625,
+      "completions/mean_terminated_length": 6801.99169921875,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 0.9554997384548187,
+      "epoch": 0.27138914443422263,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002781527815386653,
+      "learning_rate": 1e-05,
+      "loss": 0.0908,
+      "num_tokens": 239189385.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3634958863258362,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999794960021973,
+      "sampling/importance_sampling_ratio/min": 0.0018711343873292208,
+      "sampling/sampling_logp_difference/max": 6.281210422515869,
+      "sampling/sampling_logp_difference/mean": 0.020436719059944153,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 1.2612186310434481e-05,
+      "clip_ratio/high_mean": 5.171368570699997e-06,
+      "clip_ratio/low_mean": 4.8968343890010146e-05,
+      "clip_ratio/low_min": 4.0222671486844774e-06,
+      "clip_ratio/region_mean": 5.413971166490228e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16020.0,
+      "completions/mean_length": 7855.578125,
+      "completions/mean_terminated_length": 7651.2001953125,
+      "completions/min_length": 688.0,
+      "completions/min_terminated_length": 688.0,
+      "entropy": 0.9450526610016823,
+      "epoch": 0.27230910763569455,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003132987068966031,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 240217715.0,
+      "reward": 0.40625,
+      "reward_std": 0.28512775897979736,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999253153800964,
+      "sampling/importance_sampling_ratio/min": 0.0011438478250056505,
+      "sampling/sampling_logp_difference/max": 6.773357391357422,
+      "sampling/sampling_logp_difference/mean": 0.021461743861436844,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 2.172341964978841e-05,
+      "clip_ratio/high_mean": 6.823271291978017e-06,
+      "clip_ratio/low_mean": 3.516899266742257e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.199226441414794e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14060.0,
+      "completions/mean_length": 6240.265625,
+      "completions/mean_terminated_length": 5913.04833984375,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.8811023011803627,
+      "epoch": 0.2732290708371665,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028523094952106476,
+      "learning_rate": 1e-05,
+      "loss": 0.015,
+      "num_tokens": 241035133.0,
+      "reward": 0.484375,
+      "reward_std": 0.26143303513526917,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000007152557373,
+      "sampling/importance_sampling_ratio/min": 0.0006931954412721097,
+      "sampling/sampling_logp_difference/max": 7.274198532104492,
+      "sampling/sampling_logp_difference/mean": 0.019493088126182556,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 1.2606601558218244e-05,
+      "clip_ratio/high_mean": 3.151650389554561e-06,
+      "clip_ratio/low_mean": 3.768150395444536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.08331545713736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15795.0,
+      "completions/mean_length": 6103.203125,
+      "completions/mean_terminated_length": 6022.251953125,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.8766692876815796,
+      "epoch": 0.27414903403863844,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026241440791636705,
+      "learning_rate": 1e-05,
+      "loss": 0.0089,
+      "num_tokens": 241836479.0,
+      "reward": 0.453125,
+      "reward_std": 0.32589423656463623,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925434589386,
+      "sampling/importance_sampling_ratio/min": 0.00012664205860346556,
+      "sampling/sampling_logp_difference/max": 8.974145889282227,
+      "sampling/sampling_logp_difference/mean": 0.01907728984951973,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 1.7400974911652156e-05,
+      "clip_ratio/high_mean": 4.350243727913039e-06,
+      "clip_ratio/low_mean": 4.527119426711579e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962143839293276e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 7711.0703125,
+      "completions/mean_terminated_length": 7573.4052734375,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 1.0770929008722305,
+      "epoch": 0.2750689972401104,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003654222236946225,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 242844376.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999839067459106,
+      "sampling/importance_sampling_ratio/min": 0.0006267472635954618,
+      "sampling/sampling_logp_difference/max": 7.374967098236084,
+      "sampling/sampling_logp_difference/mean": 0.022012868896126747,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 1.4325163647299632e-05,
+      "clip_ratio/high_mean": 3.581290911824908e-06,
+      "clip_ratio/low_mean": 4.28195745598714e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6400865016948956e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15905.0,
+      "completions/mean_length": 6616.5546875,
+      "completions/mean_terminated_length": 6539.6455078125,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "entropy": 0.8439916148781776,
+      "epoch": 0.27598896044158233,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029195898678153753,
+      "learning_rate": 1e-05,
+      "loss": 0.1094,
+      "num_tokens": 243708479.0,
+      "reward": 0.453125,
+      "reward_std": 0.3516485095024109,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 2.189194128732197e-05,
+      "sampling/sampling_logp_difference/max": 10.729392051696777,
+      "sampling/sampling_logp_difference/mean": 0.017992788925766945,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 1.848296233220026e-05,
+      "clip_ratio/high_mean": 4.620740583050065e-06,
+      "clip_ratio/low_mean": 5.01860952226707e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.480683557834709e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15286.0,
+      "completions/mean_length": 6173.5234375,
+      "completions/mean_terminated_length": 6093.1259765625,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "entropy": 0.8975192531943321,
+      "epoch": 0.2769089236430543,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0017261393368244171,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 244515378.0,
+      "reward": 0.53125,
+      "reward_std": 0.3532412052154541,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999533891677856,
+      "sampling/importance_sampling_ratio/min": 0.000553854217287153,
+      "sampling/sampling_logp_difference/max": 7.4986090660095215,
+      "sampling/sampling_logp_difference/mean": 0.019458644092082977,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 4.114005332667148e-05,
+      "clip_ratio/high_mean": 1.2276760230633954e-05,
+      "clip_ratio/low_mean": 3.397437080820964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.625113024303573e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16032.0,
+      "completions/mean_length": 5640.90625,
+      "completions/mean_terminated_length": 5470.38134765625,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "entropy": 0.8833519890904427,
+      "epoch": 0.2778288868445262,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018768958980217576,
+      "learning_rate": 1e-05,
+      "loss": 0.0731,
+      "num_tokens": 245258318.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3135277330875397,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999516606330872,
+      "sampling/importance_sampling_ratio/min": 0.0071789538487792015,
+      "sampling/sampling_logp_difference/max": 4.936601638793945,
+      "sampling/sampling_logp_difference/mean": 0.019646335393190384,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 1.4196921938491869e-05,
+      "clip_ratio/high_mean": 4.514302474944998e-06,
+      "clip_ratio/low_mean": 4.4677519781544106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.919182129015098e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16378.0,
+      "completions/mean_length": 7840.5078125,
+      "completions/mean_terminated_length": 7564.9111328125,
+      "completions/min_length": 758.0,
+      "completions/min_terminated_length": 758.0,
+      "entropy": 0.9772802665829659,
+      "epoch": 0.27874885004599814,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002617602702230215,
+      "learning_rate": 1e-05,
+      "loss": 0.0298,
+      "num_tokens": 246280663.0,
+      "reward": 0.328125,
+      "reward_std": 0.29826050996780396,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0008982301224023104,
+      "sampling/sampling_logp_difference/max": 7.015084266662598,
+      "sampling/sampling_logp_difference/mean": 0.022171074524521828,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7621316146687604e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7621316146687604e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16326.0,
+      "completions/mean_length": 6316.1015625,
+      "completions/mean_terminated_length": 6074.47216796875,
+      "completions/min_length": 779.0,
+      "completions/min_terminated_length": 779.0,
+      "entropy": 0.8542795851826668,
+      "epoch": 0.2796688132474701,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0011874900665134192,
+      "learning_rate": 1e-05,
+      "loss": 0.0513,
+      "num_tokens": 247107604.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2227931022644043,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000126361846924,
+      "sampling/importance_sampling_ratio/min": 0.00015846268797758967,
+      "sampling/sampling_logp_difference/max": 8.749991416931152,
+      "sampling/sampling_logp_difference/mean": 0.018691308796405792,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 3.0959752166381804e-06,
+      "clip_ratio/high_mean": 7.739938041595451e-07,
+      "clip_ratio/low_mean": 6.0967123090449604e-05,
+      "clip_ratio/low_min": 2.711407751121442e-05,
+      "clip_ratio/region_mean": 6.17411176335736e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6568.171875,
+      "completions/mean_terminated_length": 6412.365234375,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "entropy": 0.9063890501856804,
+      "epoch": 0.28058877644894203,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002459619427099824,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 247967322.0,
+      "reward": 0.5,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998743534088135,
+      "sampling/importance_sampling_ratio/min": 0.012350871227681637,
+      "sampling/sampling_logp_difference/max": 4.394028663635254,
+      "sampling/sampling_logp_difference/mean": 0.020134467631578445,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 5.9507838159333915e-06,
+      "clip_ratio/high_mean": 1.4876959539833479e-06,
+      "clip_ratio/low_mean": 2.400908408617397e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.549678004015732e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15714.0,
+      "completions/mean_length": 8182.28125,
+      "completions/mean_terminated_length": 7635.50048828125,
+      "completions/min_length": 877.0,
+      "completions/min_terminated_length": 877.0,
+      "entropy": 1.0137704983353615,
+      "epoch": 0.281508739650414,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0016673406353220344,
+      "learning_rate": 1e-05,
+      "loss": 0.0244,
+      "num_tokens": 249031710.0,
+      "reward": 0.3359375,
+      "reward_std": 0.22225631773471832,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998995065689087,
+      "sampling/importance_sampling_ratio/min": 0.0008049134048633277,
+      "sampling/sampling_logp_difference/max": 7.1247758865356445,
+      "sampling/sampling_logp_difference/mean": 0.021704845130443573,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 1.4527202438330278e-05,
+      "clip_ratio/high_mean": 3.6318006095825695e-06,
+      "clip_ratio/low_mean": 3.1829216595724574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5461017205307144e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14502.0,
+      "completions/max_terminated_length": 14502.0,
+      "completions/mean_length": 6460.5703125,
+      "completions/mean_terminated_length": 6460.5703125,
+      "completions/min_length": 804.0,
+      "completions/min_terminated_length": 804.0,
+      "entropy": 1.0418165400624275,
+      "epoch": 0.2824287028518859,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022682021372020245,
+      "learning_rate": 1e-05,
+      "loss": 0.0171,
+      "num_tokens": 249881047.0,
+      "reward": 0.359375,
+      "reward_std": 0.25566887855529785,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999744296073914,
+      "sampling/importance_sampling_ratio/min": 0.002809183904901147,
+      "sampling/sampling_logp_difference/max": 5.874861240386963,
+      "sampling/sampling_logp_difference/mean": 0.02204791083931923,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 9.222687367582694e-06,
+      "clip_ratio/high_mean": 4.125313353142701e-06,
+      "clip_ratio/low_mean": 4.836107154915226e-05,
+      "clip_ratio/low_min": 3.4611657611094415e-06,
+      "clip_ratio/region_mean": 5.248638444754761e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14754.0,
+      "completions/mean_length": 6846.3046875,
+      "completions/mean_terminated_length": 6694.9130859375,
+      "completions/min_length": 944.0,
+      "completions/min_terminated_length": 944.0,
+      "entropy": 0.9839218333363533,
+      "epoch": 0.28334866605335784,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002436346374452114,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 250773806.0,
+      "reward": 0.484375,
+      "reward_std": 0.34299150109291077,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980628490448,
+      "sampling/importance_sampling_ratio/min": 0.0257408544421196,
+      "sampling/sampling_logp_difference/max": 3.6596758365631104,
+      "sampling/sampling_logp_difference/mean": 0.02135510742664337,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 1.3327621218195418e-05,
+      "clip_ratio/high_mean": 3.3319053045488545e-06,
+      "clip_ratio/low_mean": 3.791964286392613e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1251548054788145e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15777.0,
+      "completions/mean_length": 6558.53125,
+      "completions/mean_terminated_length": 6241.58056640625,
+      "completions/min_length": 884.0,
+      "completions/min_terminated_length": 884.0,
+      "entropy": 0.7833076938986778,
+      "epoch": 0.2842686292548298,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002015948062762618,
+      "learning_rate": 1e-05,
+      "loss": 0.0791,
+      "num_tokens": 251633074.0,
+      "reward": 0.46875,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999434947967529,
+      "sampling/importance_sampling_ratio/min": 5.1445105782477185e-05,
+      "sampling/sampling_logp_difference/max": 9.874995231628418,
+      "sampling/sampling_logp_difference/mean": 0.017078280448913574,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.3865982686620555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3865982686620555e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16162.0,
+      "completions/mean_length": 7626.390625,
+      "completions/mean_terminated_length": 7487.38134765625,
+      "completions/min_length": 1400.0,
+      "completions/min_terminated_length": 1400.0,
+      "entropy": 0.8946382254362106,
+      "epoch": 0.28518859245630174,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001098336186259985,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 252629300.0,
+      "reward": 0.3359375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000107288360596,
+      "sampling/importance_sampling_ratio/min": 0.00021643216314259917,
+      "sampling/sampling_logp_difference/max": 8.438233375549316,
+      "sampling/sampling_logp_difference/mean": 0.01972624473273754,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 6.5777783220255515e-06,
+      "clip_ratio/high_mean": 1.6444445805063879e-06,
+      "clip_ratio/low_mean": 1.7658890669736138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9303335250242526e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15763.0,
+      "completions/mean_length": 5796.984375,
+      "completions/mean_terminated_length": 5713.6220703125,
+      "completions/min_length": 528.0,
+      "completions/min_terminated_length": 528.0,
+      "entropy": 0.969724528491497,
+      "epoch": 0.2861085556577737,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003871417138725519,
+      "learning_rate": 1e-05,
+      "loss": 0.0408,
+      "num_tokens": 253389562.0,
+      "reward": 0.484375,
+      "reward_std": 0.23752351105213165,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998880624771118,
+      "sampling/importance_sampling_ratio/min": 2.4301782104885206e-05,
+      "sampling/sampling_logp_difference/max": 10.624960899353027,
+      "sampling/sampling_logp_difference/mean": 0.019220752641558647,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 8.099077376755304e-06,
+      "clip_ratio/high_mean": 2.8300572125772305e-06,
+      "clip_ratio/low_mean": 3.2033483023496956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.486354006554393e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15769.0,
+      "completions/mean_length": 6938.5625,
+      "completions/mean_terminated_length": 6788.63525390625,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.9812447279691696,
+      "epoch": 0.28702851885924563,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002257548039779067,
+      "learning_rate": 1e-05,
+      "loss": -0.0089,
+      "num_tokens": 254295858.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2596206068992615,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000464916229248,
+      "sampling/importance_sampling_ratio/min": 0.0009388317703269422,
+      "sampling/sampling_logp_difference/max": 6.970874309539795,
+      "sampling/sampling_logp_difference/mean": 0.02080199122428894,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 4.441917553776875e-06,
+      "clip_ratio/high_mean": 1.1104793884442188e-06,
+      "clip_ratio/low_mean": 3.414505465570983e-05,
+      "clip_ratio/low_min": 3.790060873143375e-06,
+      "clip_ratio/region_mean": 3.5255534044154047e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15163.0,
+      "completions/mean_length": 6878.15625,
+      "completions/mean_terminated_length": 6650.01611328125,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.9106859937310219,
+      "epoch": 0.28794848206071755,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00420041661709547,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 255197110.0,
+      "reward": 0.421875,
+      "reward_std": 0.30433881282806396,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999925494194031,
+      "sampling/importance_sampling_ratio/min": 0.015217061154544353,
+      "sampling/sampling_logp_difference/max": 4.185338020324707,
+      "sampling/sampling_logp_difference/mean": 0.02016574889421463,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 8.814751254249131e-06,
+      "clip_ratio/high_mean": 2.203687813562283e-06,
+      "clip_ratio/low_mean": 3.137724206681014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3580929766685585e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 6260.2578125,
+      "completions/mean_terminated_length": 6260.2578125,
+      "completions/min_length": 790.0,
+      "completions/min_terminated_length": 790.0,
+      "entropy": 0.9523455575108528,
+      "epoch": 0.2888684452621895,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027907798066735268,
+      "learning_rate": 1e-05,
+      "loss": 0.0302,
+      "num_tokens": 256018935.0,
+      "reward": 0.421875,
+      "reward_std": 0.2659186124801636,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000364780426025,
+      "sampling/importance_sampling_ratio/min": 7.485197420464829e-05,
+      "sampling/sampling_logp_difference/max": 9.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.0191945917904377,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 2.8685263259831117e-05,
+      "clip_ratio/high_mean": 7.171315814957779e-06,
+      "clip_ratio/low_mean": 2.780131131885355e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.497262770224552e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16124.0,
+      "completions/mean_length": 6202.828125,
+      "completions/mean_terminated_length": 6041.22265625,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.8513326346874237,
+      "epoch": 0.28978840846366144,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023744129575788975,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 256841129.0,
+      "reward": 0.5625,
+      "reward_std": 0.32407689094543457,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000146627426147,
+      "sampling/importance_sampling_ratio/min": 9.269781003240496e-06,
+      "sampling/sampling_logp_difference/max": 11.588750839233398,
+      "sampling/sampling_logp_difference/mean": 0.019519174471497536,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 1.6381697605538648e-05,
+      "clip_ratio/high_mean": 4.095424401384662e-06,
+      "clip_ratio/low_mean": 3.0394592840821133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.449001792432682e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16328.0,
+      "completions/mean_length": 8019.4609375,
+      "completions/mean_terminated_length": 7073.90380859375,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "entropy": 0.9211000874638557,
+      "epoch": 0.2907083716651334,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024705040268599987,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 257884188.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999315738677979,
+      "sampling/importance_sampling_ratio/min": 0.016358470544219017,
+      "sampling/sampling_logp_difference/max": 4.113009452819824,
+      "sampling/sampling_logp_difference/mean": 0.01984308287501335,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 7.485402420570608e-06,
+      "clip_ratio/high_mean": 1.871350605142652e-06,
+      "clip_ratio/low_mean": 3.025547425750119e-05,
+      "clip_ratio/low_min": 2.697337095014518e-06,
+      "clip_ratio/region_mean": 3.212682509001752e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15210.0,
+      "completions/mean_length": 7257.6875,
+      "completions/mean_terminated_length": 7038.65625,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 0.8801277950406075,
+      "epoch": 0.29162833486660533,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032848953269422054,
+      "learning_rate": 1e-05,
+      "loss": 0.0305,
+      "num_tokens": 258831852.0,
+      "reward": 0.4296875,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986124992371,
+      "sampling/importance_sampling_ratio/min": 0.00019848966621793807,
+      "sampling/sampling_logp_difference/max": 8.524773597717285,
+      "sampling/sampling_logp_difference/mean": 0.019743187353014946,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 1.52771035573096e-05,
+      "clip_ratio/high_mean": 3.8192758893274e-06,
+      "clip_ratio/low_mean": 3.605492440783564e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.987420052453672e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14903.0,
+      "completions/mean_length": 6042.84375,
+      "completions/mean_terminated_length": 5878.69873046875,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.8792382404208183,
+      "epoch": 0.29254829806807725,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004201764706522226,
+      "learning_rate": 1e-05,
+      "loss": 0.099,
+      "num_tokens": 259623512.0,
+      "reward": 0.640625,
+      "reward_std": 0.3913668990135193,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998612403869629,
+      "sampling/importance_sampling_ratio/min": 0.00027811730979010463,
+      "sampling/sampling_logp_difference/max": 8.187467575073242,
+      "sampling/sampling_logp_difference/mean": 0.018901977688074112,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.1642084397608414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1642084397608414e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16381.0,
+      "completions/mean_length": 7667.6875,
+      "completions/mean_terminated_length": 7458.49658203125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9096411988139153,
+      "epoch": 0.2934682612695492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014557713875547051,
+      "learning_rate": 1e-05,
+      "loss": 0.0383,
+      "num_tokens": 260623928.0,
+      "reward": 0.3515625,
+      "reward_std": 0.22726887464523315,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999264478683472,
+      "sampling/importance_sampling_ratio/min": 0.0002615761768538505,
+      "sampling/sampling_logp_difference/max": 8.248785018920898,
+      "sampling/sampling_logp_difference/mean": 0.01979639381170273,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 2.36019068324822e-05,
+      "clip_ratio/high_mean": 5.90047670812055e-06,
+      "clip_ratio/low_mean": 2.704614530557592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2946622809504333e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15514.0,
+      "completions/max_terminated_length": 15514.0,
+      "completions/mean_length": 6428.8203125,
+      "completions/mean_terminated_length": 6428.8203125,
+      "completions/min_length": 617.0,
+      "completions/min_terminated_length": 617.0,
+      "entropy": 0.9974069148302078,
+      "epoch": 0.29438822447102114,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028210312593728304,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 261465625.0,
+      "reward": 0.46875,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000195503234863,
+      "sampling/importance_sampling_ratio/min": 0.001225265790708363,
+      "sampling/sampling_logp_difference/max": 6.704597473144531,
+      "sampling/sampling_logp_difference/mean": 0.021066997200250626,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 2.9634452857862925e-05,
+      "clip_ratio/high_mean": 7.408613214465731e-06,
+      "clip_ratio/low_mean": 3.7066520235384814e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.447513333616371e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15224.0,
+      "completions/mean_length": 5664.515625,
+      "completions/mean_terminated_length": 5580.1103515625,
+      "completions/min_length": 299.0,
+      "completions/min_terminated_length": 299.0,
+      "entropy": 0.9557281509041786,
+      "epoch": 0.2953081876724931,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024263609666377306,
+      "learning_rate": 1e-05,
+      "loss": 0.0357,
+      "num_tokens": 262208475.0,
+      "reward": 0.4765625,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998950958251953,
+      "sampling/importance_sampling_ratio/min": 0.0001059407222783193,
+      "sampling/sampling_logp_difference/max": 9.152630805969238,
+      "sampling/sampling_logp_difference/mean": 0.01997508481144905,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 1.9527269159880234e-05,
+      "clip_ratio/high_mean": 5.685056066795369e-06,
+      "clip_ratio/low_mean": 4.980480150607036e-05,
+      "clip_ratio/low_min": 5.136423624207964e-06,
+      "clip_ratio/region_mean": 5.5489856435997353e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15566.0,
+      "completions/mean_length": 6649.5390625,
+      "completions/mean_terminated_length": 6170.794921875,
+      "completions/min_length": 599.0,
+      "completions/min_terminated_length": 599.0,
+      "entropy": 0.9003193452954292,
+      "epoch": 0.29622815087396503,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025556792970746756,
+      "learning_rate": 1e-05,
+      "loss": 0.0366,
+      "num_tokens": 263078672.0,
+      "reward": 0.453125,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998431205749512,
+      "sampling/importance_sampling_ratio/min": 3.631301660789177e-05,
+      "sampling/sampling_logp_difference/max": 10.223334312438965,
+      "sampling/sampling_logp_difference/mean": 0.019613387063145638,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.1492368912513484e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.1492368912513484e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15190.0,
+      "completions/mean_length": 5819.4140625,
+      "completions/mean_terminated_length": 5478.62060546875,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9234923645853996,
+      "epoch": 0.297148114075437,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0008845282136462629,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 263843797.0,
+      "reward": 0.5390625,
+      "reward_std": 0.14913026988506317,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452233314514,
+      "sampling/importance_sampling_ratio/min": 0.06759586930274963,
+      "sampling/sampling_logp_difference/max": 2.6942083835601807,
+      "sampling/sampling_logp_difference/mean": 0.02007308602333069,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 1.1687909363899962e-05,
+      "clip_ratio/high_mean": 2.9219773409749905e-06,
+      "clip_ratio/low_mean": 2.420720869622528e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7129186207730527e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16033.0,
+      "completions/mean_length": 6952.96875,
+      "completions/mean_terminated_length": 6726.62451171875,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "entropy": 0.8909401148557663,
+      "epoch": 0.2980680772769089,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001527746208012104,
+      "learning_rate": 1e-05,
+      "loss": 0.0633,
+      "num_tokens": 264751769.0,
+      "reward": 0.453125,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999520778656006,
+      "sampling/importance_sampling_ratio/min": 0.000505264790263027,
+      "sampling/sampling_logp_difference/max": 7.590427875518799,
+      "sampling/sampling_logp_difference/mean": 0.019622590392827988,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 1.5079081094881985e-05,
+      "clip_ratio/high_mean": 4.600909505825257e-06,
+      "clip_ratio/low_mean": 5.333864191925386e-05,
+      "clip_ratio/low_min": 5.043169494456379e-06,
+      "clip_ratio/region_mean": 5.793955187982647e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15667.0,
+      "completions/mean_length": 8138.5234375,
+      "completions/mean_terminated_length": 7733.0078125,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "entropy": 0.972789965569973,
+      "epoch": 0.29898804047838085,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003113618353381753,
+      "learning_rate": 1e-05,
+      "loss": 0.0771,
+      "num_tokens": 265810580.0,
+      "reward": 0.40625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998828172683716,
+      "sampling/importance_sampling_ratio/min": 9.312124404914357e-08,
+      "sampling/sampling_logp_difference/max": 16.189363479614258,
+      "sampling/sampling_logp_difference/mean": 0.02168515883386135,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 4.463807272259146e-06,
+      "clip_ratio/high_mean": 1.1159518180647865e-06,
+      "clip_ratio/low_mean": 3.45970811395091e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571303295757389e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16248.0,
+      "completions/mean_length": 7565.6015625,
+      "completions/mean_terminated_length": 7131.90966796875,
+      "completions/min_length": 1017.0,
+      "completions/min_terminated_length": 1017.0,
+      "entropy": 0.835600845515728,
+      "epoch": 0.2999080036798528,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0009589543915353715,
+      "learning_rate": 1e-05,
+      "loss": 0.0509,
+      "num_tokens": 266796097.0,
+      "reward": 0.5078125,
+      "reward_std": 0.16834920644760132,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0017039220547303557,
+      "sampling/sampling_logp_difference/max": 6.374822616577148,
+      "sampling/sampling_logp_difference/mean": 0.01885361596941948,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 2.260646033391822e-05,
+      "clip_ratio/high_mean": 5.651615083479555e-06,
+      "clip_ratio/low_mean": 5.806843591926736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.372005145749426e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 7124.0546875,
+      "completions/mean_terminated_length": 6668.64697265625,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "entropy": 0.9041655585169792,
+      "epoch": 0.30082796688132474,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0024741124361753464,
+      "learning_rate": 1e-05,
+      "loss": 0.0514,
+      "num_tokens": 267727528.0,
+      "reward": 0.4296875,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999866247177124,
+      "sampling/importance_sampling_ratio/min": 4.63160322397016e-05,
+      "sampling/sampling_logp_difference/max": 9.980022430419922,
+      "sampling/sampling_logp_difference/mean": 0.01998118683695793,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 1.7461054540035548e-05,
+      "clip_ratio/high_mean": 5.456775966194982e-06,
+      "clip_ratio/low_mean": 3.374219397755951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.919897017112817e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14834.0,
+      "completions/mean_length": 6757.65625,
+      "completions/mean_terminated_length": 6681.8583984375,
+      "completions/min_length": 1123.0,
+      "completions/min_terminated_length": 1123.0,
+      "entropy": 1.105302907526493,
+      "epoch": 0.3017479300827967,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002233455190435052,
+      "learning_rate": 1e-05,
+      "loss": 0.0147,
+      "num_tokens": 268610868.0,
+      "reward": 0.375,
+      "reward_std": 0.23857943713665009,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549984931946,
+      "sampling/importance_sampling_ratio/min": 3.3169128528243164e-06,
+      "sampling/sampling_logp_difference/max": 12.616476058959961,
+      "sampling/sampling_logp_difference/mean": 0.021600255742669106,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 1.7514204046165105e-05,
+      "clip_ratio/high_mean": 4.378551011541276e-06,
+      "clip_ratio/low_mean": 4.300070588669769e-05,
+      "clip_ratio/low_min": 3.6705330330732977e-06,
+      "clip_ratio/region_mean": 4.7379256784552126e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16147.0,
+      "completions/mean_length": 7522.5546875,
+      "completions/mean_terminated_length": 7381.8974609375,
+      "completions/min_length": 1390.0,
+      "completions/min_terminated_length": 1390.0,
+      "entropy": 1.0577925741672516,
+      "epoch": 0.30266789328426863,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017964976141229272,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 269594867.0,
+      "reward": 0.421875,
+      "reward_std": 0.28223684430122375,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999937891960144,
+      "sampling/importance_sampling_ratio/min": 0.002247168216854334,
+      "sampling/sampling_logp_difference/max": 6.098084449768066,
+      "sampling/sampling_logp_difference/mean": 0.021326296031475067,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 1.7011016097967513e-05,
+      "clip_ratio/high_mean": 4.252754024491878e-06,
+      "clip_ratio/low_mean": 2.5991578013417893e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0244332265283447e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 6232.109375,
+      "completions/mean_terminated_length": 5904.62890625,
+      "completions/min_length": 1238.0,
+      "completions/min_terminated_length": 1238.0,
+      "entropy": 0.8473618850111961,
+      "epoch": 0.30358785648574055,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023369218688458204,
+      "learning_rate": 1e-05,
+      "loss": 0.0291,
+      "num_tokens": 270410785.0,
+      "reward": 0.6015625,
+      "reward_std": 0.23516449332237244,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000063180923462,
+      "sampling/importance_sampling_ratio/min": 0.00010575528722256422,
+      "sampling/sampling_logp_difference/max": 9.154382705688477,
+      "sampling/sampling_logp_difference/mean": 0.018453873693943024,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 1.2072427125531249e-05,
+      "clip_ratio/high_mean": 4.300789669287042e-06,
+      "clip_ratio/low_mean": 3.064826853460545e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4949058090205654e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14454.0,
+      "completions/max_terminated_length": 14454.0,
+      "completions/mean_length": 5847.0625,
+      "completions/mean_terminated_length": 5847.0625,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "entropy": 0.8186105340719223,
+      "epoch": 0.3045078196872125,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014558705734089017,
+      "learning_rate": 1e-05,
+      "loss": 0.0672,
+      "num_tokens": 271179113.0,
+      "reward": 0.5390625,
+      "reward_std": 0.22673210501670837,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000067114830017,
+      "sampling/importance_sampling_ratio/min": 1.994453305087518e-05,
+      "sampling/sampling_logp_difference/max": 10.822555541992188,
+      "sampling/sampling_logp_difference/mean": 0.017629161477088928,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 3.204624090358266e-05,
+      "clip_ratio/high_mean": 8.719567063053546e-06,
+      "clip_ratio/low_mean": 5.131868192620459e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.0038249102944974e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16065.0,
+      "completions/mean_length": 6670.6015625,
+      "completions/mean_terminated_length": 6516.4208984375,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9379853457212448,
+      "epoch": 0.30542778288868444,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002496426459401846,
+      "learning_rate": 1e-05,
+      "loss": 0.051,
+      "num_tokens": 272054510.0,
+      "reward": 0.328125,
+      "reward_std": 0.29932624101638794,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998705387115479,
+      "sampling/importance_sampling_ratio/min": 0.00010894420120166615,
+      "sampling/sampling_logp_difference/max": 9.124674797058105,
+      "sampling/sampling_logp_difference/mean": 0.020175442099571228,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 1.1311959497106727e-05,
+      "clip_ratio/high_mean": 2.827989874276682e-06,
+      "clip_ratio/low_mean": 6.672416202491149e-05,
+      "clip_ratio/low_min": 4.344501576269977e-06,
+      "clip_ratio/region_mean": 6.955215212656185e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 6613.328125,
+      "completions/mean_terminated_length": 6613.328125,
+      "completions/min_length": 439.0,
+      "completions/min_terminated_length": 439.0,
+      "entropy": 1.0781218782067299,
+      "epoch": 0.3063477460901564,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028466631192713976,
+      "learning_rate": 1e-05,
+      "loss": 0.0257,
+      "num_tokens": 272920304.0,
+      "reward": 0.3359375,
+      "reward_std": 0.32089439034461975,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999476671218872,
+      "sampling/importance_sampling_ratio/min": 0.02985518053174019,
+      "sampling/sampling_logp_difference/max": 3.511396884918213,
+      "sampling/sampling_logp_difference/mean": 0.02250460349023342,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.3429964585375274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3429964585375274e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6632.78125,
+      "completions/mean_terminated_length": 6318.2255859375,
+      "completions/min_length": 888.0,
+      "completions/min_terminated_length": 888.0,
+      "entropy": 0.9595735669136047,
+      "epoch": 0.30726770929162833,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0027409526519477367,
+      "learning_rate": 1e-05,
+      "loss": 0.0564,
+      "num_tokens": 273789588.0,
+      "reward": 0.3671875,
+      "reward_std": 0.12863078713417053,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999409914016724,
+      "sampling/importance_sampling_ratio/min": 8.484355930704623e-05,
+      "sampling/sampling_logp_difference/max": 9.374701499938965,
+      "sampling/sampling_logp_difference/mean": 0.02000725269317627,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 1.0485138318472309e-05,
+      "clip_ratio/high_mean": 2.6212845796180773e-06,
+      "clip_ratio/low_mean": 6.270217818382662e-05,
+      "clip_ratio/low_min": 1.282997527596308e-05,
+      "clip_ratio/region_mean": 6.532346287713153e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15567.0,
+      "completions/mean_length": 8083.421875,
+      "completions/mean_terminated_length": 7884.20849609375,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.139024168252945,
+      "epoch": 0.30818767249310025,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001853835303336382,
+      "learning_rate": 1e-05,
+      "loss": 0.0521,
+      "num_tokens": 274843754.0,
+      "reward": 0.2734375,
+      "reward_std": 0.29719969630241394,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961256980896,
+      "sampling/importance_sampling_ratio/min": 6.099340225773631e-06,
+      "sampling/sampling_logp_difference/max": 12.007329940795898,
+      "sampling/sampling_logp_difference/mean": 0.023757295683026314,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 6.558237146236934e-06,
+      "clip_ratio/high_mean": 1.6395592865592334e-06,
+      "clip_ratio/low_mean": 3.2649955073793535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.428951481510012e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16058.0,
+      "completions/max_terminated_length": 16058.0,
+      "completions/mean_length": 6932.6640625,
+      "completions/mean_terminated_length": 6932.6640625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.2969390451908112,
+      "epoch": 0.3091076356945722,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002049664966762066,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 275750023.0,
+      "reward": 0.21875,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000157356262207,
+      "sampling/importance_sampling_ratio/min": 5.287989188218489e-05,
+      "sampling/sampling_logp_difference/max": 9.847487449645996,
+      "sampling/sampling_logp_difference/mean": 0.021840902045369148,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 5.1826359594997484e-06,
+      "clip_ratio/high_mean": 1.2956589898749371e-06,
+      "clip_ratio/low_mean": 3.607215444390022e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.736781377483567e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15213.0,
+      "completions/mean_length": 7630.65625,
+      "completions/mean_terminated_length": 7124.26416015625,
+      "completions/min_length": 1002.0,
+      "completions/min_terminated_length": 1002.0,
+      "entropy": 0.959126852452755,
+      "epoch": 0.31002759889604414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030745298136025667,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 276750011.0,
+      "reward": 0.3125,
+      "reward_std": 0.30091896653175354,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999728798866272,
+      "sampling/importance_sampling_ratio/min": 3.149233089061454e-05,
+      "sampling/sampling_logp_difference/max": 10.365766525268555,
+      "sampling/sampling_logp_difference/mean": 0.021394159644842148,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 6.921764679646003e-06,
+      "clip_ratio/high_mean": 2.5604765028219845e-06,
+      "clip_ratio/low_mean": 2.64957521380893e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.905622847038103e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15907.0,
+      "completions/mean_length": 7383.2421875,
+      "completions/mean_terminated_length": 7240.37353515625,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 1.1512386053800583,
+      "epoch": 0.3109475620975161,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014476332580670714,
+      "learning_rate": 1e-05,
+      "loss": 0.0686,
+      "num_tokens": 277715450.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2477683424949646,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750256538391,
+      "sampling/importance_sampling_ratio/min": 4.5251621486386284e-05,
+      "sampling/sampling_logp_difference/max": 10.00327205657959,
+      "sampling/sampling_logp_difference/mean": 0.020672230049967766,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 3.7021679872850655e-06,
+      "clip_ratio/high_mean": 9.255419968212664e-07,
+      "clip_ratio/low_mean": 3.8645826748506806e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.957136880217149e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14693.0,
+      "completions/mean_length": 5876.421875,
+      "completions/mean_terminated_length": 5793.68505859375,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "entropy": 1.0786077454686165,
+      "epoch": 0.31186752529898804,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018895689863711596,
+      "learning_rate": 1e-05,
+      "loss": 0.0067,
+      "num_tokens": 278491688.0,
+      "reward": 0.3984375,
+      "reward_std": 0.21146979928016663,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998824596405029,
+      "sampling/importance_sampling_ratio/min": 0.0007111100130714476,
+      "sampling/sampling_logp_difference/max": 7.248683452606201,
+      "sampling/sampling_logp_difference/mean": 0.020282316952943802,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 1.8740533050731756e-05,
+      "clip_ratio/high_mean": 4.685133262682939e-06,
+      "clip_ratio/low_mean": 2.9699310402975243e-05,
+      "clip_ratio/low_min": 4.435140454006614e-06,
+      "clip_ratio/region_mean": 3.4384443438284507e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14627.0,
+      "completions/mean_length": 7162.625,
+      "completions/mean_terminated_length": 6709.1142578125,
+      "completions/min_length": 986.0,
+      "completions/min_terminated_length": 986.0,
+      "entropy": 0.898807168006897,
+      "epoch": 0.31278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002133915899321437,
+      "learning_rate": 1e-05,
+      "loss": 0.0222,
+      "num_tokens": 279427384.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32142335176467896,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 0.004845126066356897,
+      "sampling/sampling_logp_difference/max": 5.329782009124756,
+      "sampling/sampling_logp_difference/mean": 0.019643021747469902,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 1.472241683586617e-05,
+      "clip_ratio/high_mean": 5.561973125622899e-06,
+      "clip_ratio/low_mean": 6.452910844245707e-05,
+      "clip_ratio/low_min": 9.302988473791629e-06,
+      "clip_ratio/region_mean": 7.009108327338254e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15965.0,
+      "completions/mean_length": 7072.3828125,
+      "completions/mean_terminated_length": 6999.06298828125,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "entropy": 0.8942967653274536,
+      "epoch": 0.3137074517019319,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023624920286238194,
+      "learning_rate": 1e-05,
+      "loss": 0.0866,
+      "num_tokens": 280352177.0,
+      "reward": 0.375,
+      "reward_std": 0.36637401580810547,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999604225158691,
+      "sampling/importance_sampling_ratio/min": 0.0008250995306298137,
+      "sampling/sampling_logp_difference/max": 7.100006580352783,
+      "sampling/sampling_logp_difference/mean": 0.020037520676851273,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.717265596809739e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.717265596809739e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16300.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 6553.203125,
+      "completions/mean_terminated_length": 6553.203125,
+      "completions/min_length": 664.0,
+      "completions/min_terminated_length": 664.0,
+      "entropy": 0.8765531405806541,
+      "epoch": 0.31462741490340385,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0025228122249245644,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 281208411.0,
+      "reward": 0.40625,
+      "reward_std": 0.3390446603298187,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999656677246094,
+      "sampling/importance_sampling_ratio/min": 0.00030091358348727226,
+      "sampling/sampling_logp_difference/max": 8.108687400817871,
+      "sampling/sampling_logp_difference/mean": 0.018958289176225662,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 1.5562100998067763e-05,
+      "clip_ratio/high_mean": 3.890525249516941e-06,
+      "clip_ratio/low_mean": 6.593948137378902e-05,
+      "clip_ratio/low_min": 1.4238520634535234e-05,
+      "clip_ratio/region_mean": 6.983000685067964e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14916.0,
+      "completions/mean_length": 6489.40625,
+      "completions/mean_terminated_length": 6087.1865234375,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.8384068235754967,
+      "epoch": 0.3155473781048758,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003243578365072608,
+      "learning_rate": 1e-05,
+      "loss": 0.119,
+      "num_tokens": 282059863.0,
+      "reward": 0.515625,
+      "reward_std": 0.39689862728118896,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999314546585083,
+      "sampling/importance_sampling_ratio/min": 0.00026549631729722023,
+      "sampling/sampling_logp_difference/max": 8.233909606933594,
+      "sampling/sampling_logp_difference/mean": 0.01820875145494938,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 4.114007424504962e-06,
+      "clip_ratio/high_mean": 1.0285018561262405e-06,
+      "clip_ratio/low_mean": 3.0735714062757324e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.176421569150989e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15859.0,
+      "completions/max_terminated_length": 15859.0,
+      "completions/mean_length": 7148.7890625,
+      "completions/mean_terminated_length": 7148.7890625,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0214989855885506,
+      "epoch": 0.31646734130634774,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027867467142641544,
+      "learning_rate": 1e-05,
+      "loss": 0.0445,
+      "num_tokens": 282994036.0,
+      "reward": 0.4921875,
+      "reward_std": 0.28511500358581543,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999145269393921,
+      "sampling/importance_sampling_ratio/min": 0.027774186804890633,
+      "sampling/sampling_logp_difference/max": 3.583648204803467,
+      "sampling/sampling_logp_difference/mean": 0.0217401385307312,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 1.6063933799159713e-05,
+      "clip_ratio/high_mean": 5.513276278179546e-06,
+      "clip_ratio/low_mean": 4.230772367463942e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.782100086231367e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16091.0,
+      "completions/max_terminated_length": 16091.0,
+      "completions/mean_length": 5532.1328125,
+      "completions/mean_terminated_length": 5532.1328125,
+      "completions/min_length": 467.0,
+      "completions/min_terminated_length": 467.0,
+      "entropy": 0.9303388148546219,
+      "epoch": 0.3173873045078197,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0024432060308754444,
+      "learning_rate": 1e-05,
+      "loss": 0.0251,
+      "num_tokens": 283723605.0,
+      "reward": 0.421875,
+      "reward_std": 0.38717782497406006,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.011936242692172527,
+      "sampling/sampling_logp_difference/max": 4.428175926208496,
+      "sampling/sampling_logp_difference/mean": 0.019281461834907532,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 6.218693215487292e-06,
+      "clip_ratio/high_mean": 1.554673303871823e-06,
+      "clip_ratio/low_mean": 1.5384349637770356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6939022600581666e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6830.09375,
+      "completions/mean_terminated_length": 6441.72314453125,
+      "completions/min_length": 503.0,
+      "completions/min_terminated_length": 503.0,
+      "entropy": 0.9551377296447754,
+      "epoch": 0.31830726770929163,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031446516513824463,
+      "learning_rate": 1e-05,
+      "loss": -0.0037,
+      "num_tokens": 284617089.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20911568403244019,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999873042106628,
+      "sampling/importance_sampling_ratio/min": 0.0007485119276680052,
+      "sampling/sampling_logp_difference/max": 7.197423458099365,
+      "sampling/sampling_logp_difference/mean": 0.01985902711749077,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 7.772906428726856e-06,
+      "clip_ratio/high_mean": 2.8712697712762747e-06,
+      "clip_ratio/low_mean": 3.287052913947264e-05,
+      "clip_ratio/low_min": 2.789369091260596e-06,
+      "clip_ratio/region_mean": 3.574179936549626e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15613.0,
+      "completions/mean_length": 6557.3515625,
+      "completions/mean_terminated_length": 6401.37353515625,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "entropy": 1.0254710763692856,
+      "epoch": 0.31922723091076355,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0024617225863039494,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 285475910.0,
+      "reward": 0.390625,
+      "reward_std": 0.2761683464050293,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999537467956543,
+      "sampling/importance_sampling_ratio/min": 0.006225659977644682,
+      "sampling/sampling_logp_difference/max": 5.079075813293457,
+      "sampling/sampling_logp_difference/mean": 0.021138068288564682,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 1.0258745533064939e-05,
+      "clip_ratio/high_mean": 3.588538106669148e-06,
+      "clip_ratio/low_mean": 6.333507008093875e-05,
+      "clip_ratio/low_min": 4.415712737682043e-06,
+      "clip_ratio/region_mean": 6.692360875604209e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15851.0,
+      "completions/mean_length": 7379.140625,
+      "completions/mean_terminated_length": 7088.6611328125,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9518962875008583,
+      "epoch": 0.3201471941122355,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017496495274826884,
+      "learning_rate": 1e-05,
+      "loss": 0.0734,
+      "num_tokens": 286439696.0,
+      "reward": 0.390625,
+      "reward_std": 0.26538965106010437,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999358654022217,
+      "sampling/importance_sampling_ratio/min": 0.006735759321600199,
+      "sampling/sampling_logp_difference/max": 5.000324726104736,
+      "sampling/sampling_logp_difference/mean": 0.021384600549936295,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.854056094747648e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.854056094747648e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16123.0,
+      "completions/mean_length": 5960.140625,
+      "completions/mean_terminated_length": 5878.06298828125,
+      "completions/min_length": 833.0,
+      "completions/min_terminated_length": 833.0,
+      "entropy": 0.9556702002882957,
+      "epoch": 0.32106715731370744,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0013999518705531955,
+      "learning_rate": 1e-05,
+      "loss": 0.0484,
+      "num_tokens": 287226394.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549984931946,
+      "sampling/importance_sampling_ratio/min": 8.140038517012727e-06,
+      "sampling/sampling_logp_difference/max": 11.71871566772461,
+      "sampling/sampling_logp_difference/mean": 0.01937047764658928,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 8.395007171202451e-06,
+      "clip_ratio/high_mean": 2.0987517928006127e-06,
+      "clip_ratio/low_mean": 3.610323426528339e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.820198628545768e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12561.0,
+      "completions/mean_length": 5387.546875,
+      "completions/mean_terminated_length": 5300.96044921875,
+      "completions/min_length": 464.0,
+      "completions/min_terminated_length": 464.0,
+      "entropy": 0.95712860673666,
+      "epoch": 0.3219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004228786565363407,
+      "learning_rate": 1e-05,
+      "loss": 0.0692,
+      "num_tokens": 287935952.0,
+      "reward": 0.5234375,
+      "reward_std": 0.29378965497016907,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000360012054443,
+      "sampling/importance_sampling_ratio/min": 0.005966294556856155,
+      "sampling/sampling_logp_difference/max": 5.121629238128662,
+      "sampling/sampling_logp_difference/mean": 0.020441649481654167,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 1.2559637070808094e-05,
+      "clip_ratio/high_mean": 3.1399092677020235e-06,
+      "clip_ratio/low_mean": 2.673440690159623e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9874316624045605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15957.0,
+      "completions/mean_length": 5799.625,
+      "completions/mean_terminated_length": 5716.283203125,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.9457403644919395,
+      "epoch": 0.32290708371665133,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0029834613669663668,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 288696000.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3884710967540741,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999950528144836,
+      "sampling/importance_sampling_ratio/min": 0.0011352180736139417,
+      "sampling/sampling_logp_difference/max": 6.780930519104004,
+      "sampling/sampling_logp_difference/mean": 0.021189026534557343,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 6.2518756749341264e-06,
+      "clip_ratio/high_mean": 1.5629689187335316e-06,
+      "clip_ratio/low_mean": 3.849920358334202e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0062172047328204e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16148.0,
+      "completions/mean_length": 7058.6875,
+      "completions/mean_terminated_length": 6757.87060546875,
+      "completions/min_length": 799.0,
+      "completions/min_terminated_length": 799.0,
+      "entropy": 0.8782663866877556,
+      "epoch": 0.32382704691812325,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002617151942104101,
+      "learning_rate": 1e-05,
+      "loss": 0.0874,
+      "num_tokens": 289618904.0,
+      "reward": 0.3515625,
+      "reward_std": 0.28353992104530334,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999942779541016,
+      "sampling/importance_sampling_ratio/min": 0.001438659499399364,
+      "sampling/sampling_logp_difference/max": 6.54404354095459,
+      "sampling/sampling_logp_difference/mean": 0.019699860364198685,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 1.8079134861181956e-05,
+      "clip_ratio/high_mean": 4.519783715295489e-06,
+      "clip_ratio/low_mean": 6.639697721766424e-05,
+      "clip_ratio/low_min": 1.0295151696482208e-05,
+      "clip_ratio/region_mean": 7.091676206982811e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15519.0,
+      "completions/mean_length": 6609.953125,
+      "completions/mean_terminated_length": 6454.81005859375,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "entropy": 0.8895087689161301,
+      "epoch": 0.3247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0021503251045942307,
+      "learning_rate": 1e-05,
+      "loss": 0.044,
+      "num_tokens": 290484378.0,
+      "reward": 0.3671875,
+      "reward_std": 0.35324612259864807,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 5.448641240946017e-05,
+      "sampling/sampling_logp_difference/max": 9.817559242248535,
+      "sampling/sampling_logp_difference/mean": 0.0200796015560627,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 6.141278026916552e-05,
+      "clip_ratio/low_min": 1.333249815616e-05,
+      "clip_ratio/region_mean": 6.141278026916552e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 7872.4921875,
+      "completions/mean_terminated_length": 7453.89306640625,
+      "completions/min_length": 328.0,
+      "completions/min_terminated_length": 328.0,
+      "entropy": 0.9183534607291222,
+      "epoch": 0.32566697332106714,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0023925534915179014,
+      "learning_rate": 1e-05,
+      "loss": 0.0895,
+      "num_tokens": 291512393.0,
+      "reward": 0.34375,
+      "reward_std": 0.3763991594314575,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0004287353658583015,
+      "sampling/sampling_logp_difference/max": 7.7546706199646,
+      "sampling/sampling_logp_difference/mean": 0.020358648151159286,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 1.0912609013757901e-05,
+      "clip_ratio/high_mean": 3.7178592720010784e-06,
+      "clip_ratio/low_mean": 1.995230707052542e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.367016588777915e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15485.0,
+      "completions/mean_length": 6605.6640625,
+      "completions/mean_terminated_length": 6290.23388671875,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "entropy": 0.9602678120136261,
+      "epoch": 0.3265869365225391,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018709113355726004,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 292380390.0,
+      "reward": 0.515625,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999732375144958,
+      "sampling/importance_sampling_ratio/min": 6.221406168016586e-10,
+      "sampling/sampling_logp_difference/max": 21.19785499572754,
+      "sampling/sampling_logp_difference/mean": 0.02150166593492031,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 2.202200403189636e-05,
+      "clip_ratio/high_mean": 6.279054105107207e-06,
+      "clip_ratio/low_mean": 5.168271604816255e-05,
+      "clip_ratio/low_min": 7.731559890089557e-06,
+      "clip_ratio/region_mean": 5.796177038064343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13477.0,
+      "completions/max_terminated_length": 13477.0,
+      "completions/mean_length": 6677.8828125,
+      "completions/mean_terminated_length": 6677.8828125,
+      "completions/min_length": 754.0,
+      "completions/min_terminated_length": 754.0,
+      "entropy": 1.001693107187748,
+      "epoch": 0.32750689972401104,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017649955116212368,
+      "learning_rate": 1e-05,
+      "loss": 0.0502,
+      "num_tokens": 293255287.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998878240585327,
+      "sampling/importance_sampling_ratio/min": 0.0027159738820046186,
+      "sampling/sampling_logp_difference/max": 5.908604621887207,
+      "sampling/sampling_logp_difference/mean": 0.020375655964016914,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 5.7686097534315195e-06,
+      "clip_ratio/high_mean": 2.223324372607749e-06,
+      "clip_ratio/low_mean": 2.7612236522145395e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9835560894753144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15925.0,
+      "completions/mean_length": 6210.6953125,
+      "completions/mean_terminated_length": 6049.21484375,
+      "completions/min_length": 870.0,
+      "completions/min_terminated_length": 870.0,
+      "entropy": 0.9842480793595314,
+      "epoch": 0.32842686292548295,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024816791992634535,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 294069184.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000953674316406,
+      "sampling/importance_sampling_ratio/min": 0.0047831060364842415,
+      "sampling/sampling_logp_difference/max": 5.342665195465088,
+      "sampling/sampling_logp_difference/mean": 0.021009165793657303,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 5.0844009820139036e-06,
+      "clip_ratio/high_mean": 1.2711002455034759e-06,
+      "clip_ratio/low_mean": 4.299241186345171e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.426351074471313e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16085.0,
+      "completions/mean_length": 6876.0546875,
+      "completions/mean_terminated_length": 6725.13525390625,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "entropy": 0.8680268228054047,
+      "epoch": 0.32934682612695493,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0030787813011556864,
+      "learning_rate": 1e-05,
+      "loss": 0.1096,
+      "num_tokens": 294969111.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3514111638069153,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999387264251709,
+      "sampling/importance_sampling_ratio/min": 0.0036591701209545135,
+      "sampling/sampling_logp_difference/max": 5.610518932342529,
+      "sampling/sampling_logp_difference/mean": 0.019419874995946884,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 5.279830929794116e-06,
+      "clip_ratio/high_mean": 1.319957732448529e-06,
+      "clip_ratio/low_mean": 3.3445195754211454e-05,
+      "clip_ratio/low_min": 3.1955414669937454e-06,
+      "clip_ratio/region_mean": 3.476515314559947e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16154.0,
+      "completions/mean_length": 7079.7734375,
+      "completions/mean_terminated_length": 6932.087890625,
+      "completions/min_length": 973.0,
+      "completions/min_terminated_length": 973.0,
+      "entropy": 1.0033101588487625,
+      "epoch": 0.33026678932842685,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027940638829022646,
+      "learning_rate": 1e-05,
+      "loss": 0.1352,
+      "num_tokens": 295894682.0,
+      "reward": 0.4140625,
+      "reward_std": 0.40319663286209106,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999890923500061,
+      "sampling/importance_sampling_ratio/min": 0.00033553718822076917,
+      "sampling/sampling_logp_difference/max": 7.999777793884277,
+      "sampling/sampling_logp_difference/mean": 0.021608728915452957,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 4.0542295209888835e-06,
+      "clip_ratio/high_mean": 1.0135573802472209e-06,
+      "clip_ratio/low_mean": 3.935158406420669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0365141785514425e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14374.0,
+      "completions/mean_length": 6487.421875,
+      "completions/mean_terminated_length": 6249.904296875,
+      "completions/min_length": 637.0,
+      "completions/min_terminated_length": 637.0,
+      "entropy": 0.9404204189777374,
+      "epoch": 0.3311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021709369029849768,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 296744216.0,
+      "reward": 0.4296875,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000174045562744,
+      "sampling/importance_sampling_ratio/min": 0.00012341697583906353,
+      "sampling/sampling_logp_difference/max": 8.9999418258667,
+      "sampling/sampling_logp_difference/mean": 0.02024281956255436,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 2.4414162908215076e-05,
+      "clip_ratio/high_mean": 6.103540727053769e-06,
+      "clip_ratio/low_mean": 2.0490186102506414e-05,
+      "clip_ratio/low_min": 2.8498473056970397e-06,
+      "clip_ratio/region_mean": 2.6593726602186507e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14584.0,
+      "completions/mean_length": 6668.1953125,
+      "completions/mean_terminated_length": 6273.24365234375,
+      "completions/min_length": 567.0,
+      "completions/min_terminated_length": 567.0,
+      "entropy": 0.8671490699052811,
+      "epoch": 0.33210671573137074,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018110686214640737,
+      "learning_rate": 1e-05,
+      "loss": -0.0018,
+      "num_tokens": 297617937.0,
+      "reward": 0.4765625,
+      "reward_std": 0.22673210501670837,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999983549118042,
+      "sampling/importance_sampling_ratio/min": 0.0003801324055530131,
+      "sampling/sampling_logp_difference/max": 7.874990940093994,
+      "sampling/sampling_logp_difference/mean": 0.01934785582125187,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 8.66071218297293e-06,
+      "clip_ratio/high_mean": 2.1651780457432324e-06,
+      "clip_ratio/low_mean": 2.4539695857583865e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6704873903327098e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15979.0,
+      "completions/mean_length": 8579.9921875,
+      "completions/mean_terminated_length": 7989.7734375,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 1.0337364450097084,
+      "epoch": 0.3330266789328427,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014365602983161807,
+      "learning_rate": 1e-05,
+      "loss": 0.045,
+      "num_tokens": 298736304.0,
+      "reward": 0.1953125,
+      "reward_std": 0.1999218761920929,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999146461486816,
+      "sampling/importance_sampling_ratio/min": 0.0014037116197869182,
+      "sampling/sampling_logp_difference/max": 6.5686354637146,
+      "sampling/sampling_logp_difference/mean": 0.021067796275019646,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 7.748803682261496e-06,
+      "clip_ratio/high_mean": 1.937200920565374e-06,
+      "clip_ratio/low_mean": 5.063434127805522e-05,
+      "clip_ratio/low_min": 9.66116931522265e-06,
+      "clip_ratio/region_mean": 5.257154271021136e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16368.0,
+      "completions/mean_length": 7000.8203125,
+      "completions/mean_terminated_length": 6926.93701171875,
+      "completions/min_length": 456.0,
+      "completions/min_terminated_length": 456.0,
+      "entropy": 0.8918163478374481,
+      "epoch": 0.33394664213431463,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003008107887580991,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "num_tokens": 299653249.0,
+      "reward": 0.453125,
+      "reward_std": 0.3322049677371979,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999739527702332,
+      "sampling/importance_sampling_ratio/min": 0.002478980226442218,
+      "sampling/sampling_logp_difference/max": 5.999907970428467,
+      "sampling/sampling_logp_difference/mean": 0.020022090524435043,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 1.5043352505017538e-05,
+      "clip_ratio/high_mean": 3.7608381262543844e-06,
+      "clip_ratio/low_mean": 8.800596447144926e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.2561434687086148e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16375.0,
+      "completions/max_terminated_length": 16375.0,
+      "completions/mean_length": 7319.578125,
+      "completions/mean_terminated_length": 7319.578125,
+      "completions/min_length": 1974.0,
+      "completions/min_terminated_length": 1974.0,
+      "entropy": 0.9145128801465034,
+      "epoch": 0.33486660533578655,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0010370119707658887,
+      "learning_rate": 1e-05,
+      "loss": 0.0138,
+      "num_tokens": 300608099.0,
+      "reward": 0.4609375,
+      "reward_std": 0.1412346363067627,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999158382415771,
+      "sampling/importance_sampling_ratio/min": 0.00012156071898061782,
+      "sampling/sampling_logp_difference/max": 9.015096664428711,
+      "sampling/sampling_logp_difference/mean": 0.019386455416679382,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 9.589830597178661e-06,
+      "clip_ratio/high_mean": 2.3974576492946653e-06,
+      "clip_ratio/low_mean": 2.2494899667435675e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4892357714634272e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 6956.90625,
+      "completions/mean_terminated_length": 6882.67724609375,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.9679212644696236,
+      "epoch": 0.3357865685372585,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021569218952208757,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 301516535.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23462772369384766,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 0.01621459797024727,
+      "sampling/sampling_logp_difference/max": 4.121843338012695,
+      "sampling/sampling_logp_difference/mean": 0.020638462156057358,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 1.1957331025769236e-05,
+      "clip_ratio/high_mean": 2.989332756442309e-06,
+      "clip_ratio/low_mean": 2.334770033485256e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6337033204981708e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16343.0,
+      "completions/mean_length": 6933.1953125,
+      "completions/mean_terminated_length": 6706.37646484375,
+      "completions/min_length": 979.0,
+      "completions/min_terminated_length": 979.0,
+      "entropy": 0.9610472694039345,
+      "epoch": 0.33670653173873044,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0019900640472769737,
+      "learning_rate": 1e-05,
+      "loss": 0.0329,
+      "num_tokens": 302422120.0,
+      "reward": 0.4921875,
+      "reward_std": 0.22908620536327362,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999517202377319,
+      "sampling/importance_sampling_ratio/min": 7.346414143682978e-09,
+      "sampling/sampling_logp_difference/max": 18.729053497314453,
+      "sampling/sampling_logp_difference/mean": 0.020782412961125374,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 1.6365190958822495e-05,
+      "clip_ratio/high_mean": 4.091297739705624e-06,
+      "clip_ratio/low_mean": 2.5385876426753384e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9477173825398495e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15799.0,
+      "completions/max_terminated_length": 15799.0,
+      "completions/mean_length": 6711.640625,
+      "completions/mean_terminated_length": 6711.640625,
+      "completions/min_length": 814.0,
+      "completions/min_terminated_length": 814.0,
+      "entropy": 0.8035724982619286,
+      "epoch": 0.3376264949402024,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001954294042661786,
+      "learning_rate": 1e-05,
+      "loss": 0.0264,
+      "num_tokens": 303299402.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2856517732143402,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000114440917969,
+      "sampling/importance_sampling_ratio/min": 0.002623806707561016,
+      "sampling/sampling_logp_difference/max": 5.943129062652588,
+      "sampling/sampling_logp_difference/mean": 0.018188728019595146,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 8.633360948806512e-06,
+      "clip_ratio/high_mean": 2.158340237201628e-06,
+      "clip_ratio/low_mean": 3.7187305906627444e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9345645916455396e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15980.0,
+      "completions/mean_length": 6977.890625,
+      "completions/mean_terminated_length": 6674.4677734375,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.9545647650957108,
+      "epoch": 0.33854645814167433,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022571857552975416,
+      "learning_rate": 1e-05,
+      "loss": 0.0187,
+      "num_tokens": 304210412.0,
+      "reward": 0.4375,
+      "reward_std": 0.19568344950675964,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999645948410034,
+      "sampling/importance_sampling_ratio/min": 5.501153282239102e-06,
+      "sampling/sampling_logp_difference/max": 12.110552787780762,
+      "sampling/sampling_logp_difference/mean": 0.021196123212575912,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 1.2197504474897869e-05,
+      "clip_ratio/high_mean": 3.0493761187244672e-06,
+      "clip_ratio/low_mean": 2.7975384682576987e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1024760801301454e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16087.0,
+      "completions/mean_length": 5952.8359375,
+      "completions/mean_terminated_length": 5349.3798828125,
+      "completions/min_length": 651.0,
+      "completions/min_terminated_length": 651.0,
+      "entropy": 0.846152663230896,
+      "epoch": 0.33946642134314625,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003541936632245779,
+      "learning_rate": 1e-05,
+      "loss": 0.0897,
+      "num_tokens": 304989015.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998842477798462,
+      "sampling/importance_sampling_ratio/min": 0.0019083521328866482,
+      "sampling/sampling_logp_difference/max": 6.261515140533447,
+      "sampling/sampling_logp_difference/mean": 0.018978029489517212,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 1.1725882586688385e-05,
+      "clip_ratio/high_mean": 2.9314706466720963e-06,
+      "clip_ratio/low_mean": 6.290217379500973e-05,
+      "clip_ratio/low_min": 1.226112590302364e-05,
+      "clip_ratio/region_mean": 6.583364438483841e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16098.0,
+      "completions/mean_length": 7976.9296875,
+      "completions/mean_terminated_length": 7635.1787109375,
+      "completions/min_length": 514.0,
+      "completions/min_terminated_length": 514.0,
+      "entropy": 0.9827005565166473,
+      "epoch": 0.3403863845446182,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023713603150099516,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 306032054.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2527809143066406,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000019073486328,
+      "sampling/importance_sampling_ratio/min": 3.2563195873080986e-07,
+      "sampling/sampling_logp_difference/max": 14.937498092651367,
+      "sampling/sampling_logp_difference/mean": 0.0217706598341465,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 2.3902987095425487e-05,
+      "clip_ratio/high_mean": 7.721868257704045e-06,
+      "clip_ratio/low_mean": 4.01184702241153e-05,
+      "clip_ratio/low_min": 1.341508686891757e-05,
+      "clip_ratio/region_mean": 4.784033922078379e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16369.0,
+      "completions/mean_length": 7117.8828125,
+      "completions/mean_terminated_length": 6895.49609375,
+      "completions/min_length": 1314.0,
+      "completions/min_terminated_length": 1314.0,
+      "entropy": 0.8897347301244736,
+      "epoch": 0.34130634774609014,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023132229689508677,
+      "learning_rate": 1e-05,
+      "loss": 0.162,
+      "num_tokens": 306960599.0,
+      "reward": 0.515625,
+      "reward_std": 0.34822866320610046,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999181032180786,
+      "sampling/importance_sampling_ratio/min": 0.0007341355667449534,
+      "sampling/sampling_logp_difference/max": 7.2168169021606445,
+      "sampling/sampling_logp_difference/mean": 0.018669119104743004,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 4.371240720502101e-06,
+      "clip_ratio/high_mean": 1.0928101801255252e-06,
+      "clip_ratio/low_mean": 4.9660218792269006e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.075302897239453e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15995.0,
+      "completions/mean_length": 6971.0390625,
+      "completions/mean_terminated_length": 6745.12841796875,
+      "completions/min_length": 871.0,
+      "completions/min_terminated_length": 871.0,
+      "entropy": 1.0919678956270218,
+      "epoch": 0.3422263109475621,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0030236958991736174,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 307873100.0,
+      "reward": 0.3359375,
+      "reward_std": 0.34245961904525757,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000091791152954,
+      "sampling/importance_sampling_ratio/min": 0.01082979142665863,
+      "sampling/sampling_logp_difference/max": 4.525454521179199,
+      "sampling/sampling_logp_difference/mean": 0.022024717181921005,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 4.341634394222638e-06,
+      "clip_ratio/high_mean": 1.0854085985556594e-06,
+      "clip_ratio/low_mean": 3.061858558339736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.170399429563986e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14442.0,
+      "completions/mean_length": 7120.0,
+      "completions/mean_terminated_length": 6897.66455078125,
+      "completions/min_length": 1685.0,
+      "completions/min_terminated_length": 1685.0,
+      "entropy": 1.0812252908945084,
+      "epoch": 0.34314627414903404,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0018919071881100535,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 308804876.0,
+      "reward": 0.28125,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999696612358093,
+      "sampling/importance_sampling_ratio/min": 0.0011743507348001003,
+      "sampling/sampling_logp_difference/max": 6.747039794921875,
+      "sampling/sampling_logp_difference/mean": 0.022177904844284058,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 4.6198765630833805e-06,
+      "clip_ratio/high_mean": 1.1549691407708451e-06,
+      "clip_ratio/low_mean": 1.3996559573570266e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.5151528714341111e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15814.0,
+      "completions/mean_length": 7344.5546875,
+      "completions/mean_terminated_length": 6977.09716796875,
+      "completions/min_length": 127.0,
+      "completions/min_terminated_length": 127.0,
+      "entropy": 0.9340410158038139,
+      "epoch": 0.34406623735050595,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001848200336098671,
+      "learning_rate": 1e-05,
+      "loss": 0.0195,
+      "num_tokens": 309762603.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2188364714384079,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999948143959045,
+      "sampling/importance_sampling_ratio/min": 0.0002964614541269839,
+      "sampling/sampling_logp_difference/max": 8.1235933303833,
+      "sampling/sampling_logp_difference/mean": 0.02034556306898594,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 1.3913735983805964e-05,
+      "clip_ratio/high_mean": 3.478433995951491e-06,
+      "clip_ratio/low_mean": 2.4544106395296694e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8022539936500834e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15244.0,
+      "completions/max_terminated_length": 15244.0,
+      "completions/mean_length": 6615.6484375,
+      "completions/mean_terminated_length": 6615.6484375,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "entropy": 0.971637412905693,
+      "epoch": 0.34498620055197793,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0012123315827921033,
+      "learning_rate": 1e-05,
+      "loss": 0.0581,
+      "num_tokens": 310628230.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999868869781494,
+      "sampling/importance_sampling_ratio/min": 2.587145718280226e-05,
+      "sampling/sampling_logp_difference/max": 10.562370300292969,
+      "sampling/sampling_logp_difference/mean": 0.020877305418252945,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 6.119951194705209e-06,
+      "clip_ratio/high_mean": 1.5299877986763022e-06,
+      "clip_ratio/low_mean": 4.789722436271404e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.942721272982453e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16028.0,
+      "completions/mean_length": 6333.84375,
+      "completions/mean_terminated_length": 6009.64501953125,
+      "completions/min_length": 564.0,
+      "completions/min_terminated_length": 564.0,
+      "entropy": 0.9569023698568344,
+      "epoch": 0.34590616375344985,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002646032487973571,
+      "learning_rate": 1e-05,
+      "loss": 0.086,
+      "num_tokens": 311457466.0,
+      "reward": 0.4453125,
+      "reward_std": 0.34928950667381287,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000247955322266,
+      "sampling/importance_sampling_ratio/min": 0.022760435938835144,
+      "sampling/sampling_logp_difference/max": 3.782731533050537,
+      "sampling/sampling_logp_difference/mean": 0.020464638248085976,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 1.8126566374121467e-05,
+      "clip_ratio/high_mean": 4.531641593530367e-06,
+      "clip_ratio/low_mean": 4.1024483266483e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5556124632639694e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15719.0,
+      "completions/mean_length": 6657.8515625,
+      "completions/mean_terminated_length": 6503.46875,
+      "completions/min_length": 594.0,
+      "completions/min_terminated_length": 594.0,
+      "entropy": 1.029910758137703,
+      "epoch": 0.3468261269549218,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021437006071209908,
+      "learning_rate": 1e-05,
+      "loss": -0.0212,
+      "num_tokens": 312330879.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000024437904358,
+      "sampling/importance_sampling_ratio/min": 0.020200612023472786,
+      "sampling/sampling_logp_difference/max": 3.9020423889160156,
+      "sampling/sampling_logp_difference/mean": 0.021411258727312088,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7961265118392475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7961265118392475e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16311.0,
+      "completions/mean_length": 7657.8359375,
+      "completions/mean_terminated_length": 7376.3466796875,
+      "completions/min_length": 741.0,
+      "completions/min_terminated_length": 741.0,
+      "entropy": 0.9699486121535301,
+      "epoch": 0.34774609015639374,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018965511117130518,
+      "learning_rate": 1e-05,
+      "loss": 0.066,
+      "num_tokens": 313331898.0,
+      "reward": 0.3515625,
+      "reward_std": 0.18884865939617157,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000010371208191,
+      "sampling/importance_sampling_ratio/min": 7.867415661166888e-06,
+      "sampling/sampling_logp_difference/max": 11.75278091430664,
+      "sampling/sampling_logp_difference/mean": 0.021029409021139145,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 7.721664815107943e-06,
+      "clip_ratio/high_mean": 2.7168170504410227e-06,
+      "clip_ratio/low_mean": 4.313065619498957e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.584747375702136e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14619.0,
+      "completions/mean_length": 7085.3671875,
+      "completions/mean_terminated_length": 6937.77001953125,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "entropy": 1.0943557620048523,
+      "epoch": 0.3486660533578657,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016498853219673038,
+      "learning_rate": 1e-05,
+      "loss": 0.0346,
+      "num_tokens": 314258601.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24329257011413574,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000105857849121,
+      "sampling/importance_sampling_ratio/min": 0.03447282314300537,
+      "sampling/sampling_logp_difference/max": 3.367583990097046,
+      "sampling/sampling_logp_difference/mean": 0.021414825692772865,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 7.953489330247976e-06,
+      "clip_ratio/high_mean": 1.988372332561994e-06,
+      "clip_ratio/low_mean": 3.479703536868328e-05,
+      "clip_ratio/low_min": 2.6767741019284585e-06,
+      "clip_ratio/region_mean": 3.6785407701245276e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15631.0,
+      "completions/mean_length": 7614.1171875,
+      "completions/mean_terminated_length": 7182.81103515625,
+      "completions/min_length": 511.0,
+      "completions/min_terminated_length": 511.0,
+      "entropy": 0.9673903658986092,
+      "epoch": 0.34958601655933763,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001364902127534151,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 315256840.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3503454327583313,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 6.874255632283166e-05,
+      "sampling/sampling_logp_difference/max": 9.585142135620117,
+      "sampling/sampling_logp_difference/mean": 0.02000460773706436,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 6.980824764468707e-06,
+      "clip_ratio/high_mean": 1.7452061911171768e-06,
+      "clip_ratio/low_mean": 4.410173994529032e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5846945681660145e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15637.0,
+      "completions/mean_length": 7766.9375,
+      "completions/mean_terminated_length": 7630.1591796875,
+      "completions/min_length": 57.0,
+      "completions/min_terminated_length": 57.0,
+      "entropy": 1.0277370810508728,
+      "epoch": 0.35050597976080955,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002171436557546258,
+      "learning_rate": 1e-05,
+      "loss": 0.0705,
+      "num_tokens": 316268976.0,
+      "reward": 0.34375,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999735951423645,
+      "sampling/importance_sampling_ratio/min": 7.485197420464829e-05,
+      "sampling/sampling_logp_difference/max": 9.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.021251089870929718,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 9.843256520980503e-06,
+      "clip_ratio/high_mean": 3.5061395919910865e-06,
+      "clip_ratio/low_mean": 3.973216325903195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.323830307839671e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15752.0,
+      "completions/mean_length": 7774.2265625,
+      "completions/mean_terminated_length": 7567.59228515625,
+      "completions/min_length": 595.0,
+      "completions/min_terminated_length": 595.0,
+      "entropy": 1.0064171329140663,
+      "epoch": 0.3514259429622815,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0013348929351195693,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 317285677.0,
+      "reward": 0.28125,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999904632568359,
+      "sampling/importance_sampling_ratio/min": 1.7632934259381727e-06,
+      "sampling/sampling_logp_difference/max": 13.248327255249023,
+      "sampling/sampling_logp_difference/mean": 0.022232960909605026,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.2021426648043416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2021426648043416e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16208.0,
+      "completions/mean_length": 6547.1796875,
+      "completions/mean_terminated_length": 6469.724609375,
+      "completions/min_length": 894.0,
+      "completions/min_terminated_length": 894.0,
+      "entropy": 0.9192209765315056,
+      "epoch": 0.35234590616375344,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002925361506640911,
+      "learning_rate": 1e-05,
+      "loss": 0.0809,
+      "num_tokens": 318148276.0,
+      "reward": 0.515625,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999691843986511,
+      "sampling/importance_sampling_ratio/min": 7.411971182591515e-06,
+      "sampling/sampling_logp_difference/max": 11.812414169311523,
+      "sampling/sampling_logp_difference/mean": 0.020470617339015007,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 1.543848429719219e-05,
+      "clip_ratio/high_mean": 3.8596210742980475e-06,
+      "clip_ratio/low_mean": 2.0332364726982632e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4191985573907004e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6904.40625,
+      "completions/mean_terminated_length": 6101.05078125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.9611739367246628,
+      "epoch": 0.3532658693652254,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002288331277668476,
+      "learning_rate": 1e-05,
+      "loss": 0.0388,
+      "num_tokens": 319052224.0,
+      "reward": 0.390625,
+      "reward_std": 0.23645779490470886,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999583959579468,
+      "sampling/importance_sampling_ratio/min": 1.0906596799031831e-05,
+      "sampling/sampling_logp_difference/max": 11.426142692565918,
+      "sampling/sampling_logp_difference/mean": 0.02049478143453598,
+      "step": 384
+    },
+    {
+      "clip_ratio/high_max": 1.0430391284899088e-05,
+      "clip_ratio/high_mean": 3.662984454422258e-06,
+      "clip_ratio/low_mean": 3.791802066643868e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.158100534823461e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16195.0,
+      "completions/mean_length": 7632.359375,
+      "completions/mean_terminated_length": 7350.04833984375,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "entropy": 1.0255606770515442,
+      "epoch": 0.35418583256669733,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015405584126710892,
+      "learning_rate": 1e-05,
+      "loss": 0.111,
+      "num_tokens": 320051534.0,
+      "reward": 0.3515625,
+      "reward_std": 0.30327799916267395,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000008344650269,
+      "sampling/importance_sampling_ratio/min": 0.00014919505338184536,
+      "sampling/sampling_logp_difference/max": 8.810256004333496,
+      "sampling/sampling_logp_difference/mean": 0.021682340651750565,
+      "step": 385
+    },
+    {
+      "clip_ratio/high_max": 1.10081018647179e-05,
+      "clip_ratio/high_mean": 2.752025466179475e-06,
+      "clip_ratio/low_mean": 2.2116193804322393e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4868219043128192e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14348.0,
+      "completions/mean_length": 6260.8828125,
+      "completions/mean_terminated_length": 6100.19873046875,
+      "completions/min_length": 1371.0,
+      "completions/min_terminated_length": 1371.0,
+      "entropy": 0.7945073395967484,
+      "epoch": 0.35510579576816925,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00142462900839746,
+      "learning_rate": 1e-05,
+      "loss": 0.0707,
+      "num_tokens": 320872143.0,
+      "reward": 0.53125,
+      "reward_std": 0.18990948796272278,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999794960021973,
+      "sampling/importance_sampling_ratio/min": 0.0003461402375251055,
+      "sampling/sampling_logp_difference/max": 7.9686665534973145,
+      "sampling/sampling_logp_difference/mean": 0.018331468105316162,
+      "step": 386
+    },
+    {
+      "clip_ratio/high_max": 8.952299140219111e-06,
+      "clip_ratio/high_mean": 2.2380747850547777e-06,
+      "clip_ratio/low_mean": 2.7251681331108557e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9489756570910686e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15807.0,
+      "completions/mean_length": 7761.375,
+      "completions/mean_terminated_length": 7693.48046875,
+      "completions/min_length": 765.0,
+      "completions/min_terminated_length": 765.0,
+      "entropy": 1.0799954682588577,
+      "epoch": 0.3560257589696412,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0019111793953925371,
+      "learning_rate": 1e-05,
+      "loss": 0.0527,
+      "num_tokens": 321885447.0,
+      "reward": 0.390625,
+      "reward_std": 0.2806519567966461,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999994039535522,
+      "sampling/importance_sampling_ratio/min": 0.00028313760412856936,
+      "sampling/sampling_logp_difference/max": 8.169577598571777,
+      "sampling/sampling_logp_difference/mean": 0.02205459028482437,
+      "step": 387
+    },
+    {
+      "clip_ratio/high_max": 1.6241773209912935e-05,
+      "clip_ratio/high_mean": 5.09954668359569e-06,
+      "clip_ratio/low_mean": 4.549925756691664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.0598803454704466e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15808.0,
+      "completions/mean_length": 7294.796875,
+      "completions/mean_terminated_length": 7076.65625,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "entropy": 0.8159547671675682,
+      "epoch": 0.35694572217111314,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.001482969499193132,
+      "learning_rate": 1e-05,
+      "loss": 0.0502,
+      "num_tokens": 322838797.0,
+      "reward": 0.5234375,
+      "reward_std": 0.36007601022720337,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999374151229858,
+      "sampling/importance_sampling_ratio/min": 4.2558355062283226e-07,
+      "sampling/sampling_logp_difference/max": 14.669804573059082,
+      "sampling/sampling_logp_difference/mean": 0.01850103959441185,
+      "step": 388
+    },
+    {
+      "clip_ratio/high_max": 4.2527130972302984e-06,
+      "clip_ratio/high_mean": 1.7856882550404407e-06,
+      "clip_ratio/low_mean": 2.875013205994037e-05,
+      "clip_ratio/low_min": 3.824852228717646e-06,
+      "clip_ratio/region_mean": 3.053582031498081e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15897.0,
+      "completions/mean_length": 7229.875,
+      "completions/mean_terminated_length": 6779.671875,
+      "completions/min_length": 579.0,
+      "completions/min_terminated_length": 579.0,
+      "entropy": 0.9420096501708031,
+      "epoch": 0.3578656853725851,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001755179837346077,
+      "learning_rate": 1e-05,
+      "loss": 0.075,
+      "num_tokens": 323782333.0,
+      "reward": 0.3984375,
+      "reward_std": 0.24541424214839935,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999120831489563,
+      "sampling/importance_sampling_ratio/min": 7.437798922182992e-05,
+      "sampling/sampling_logp_difference/max": 9.50635051727295,
+      "sampling/sampling_logp_difference/mean": 0.02008935809135437,
+      "step": 389
+    },
+    {
+      "clip_ratio/high_max": 8.81059531820938e-06,
+      "clip_ratio/high_mean": 2.202648829552345e-06,
+      "clip_ratio/low_mean": 2.0493020770118164e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.269566959967051e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16016.0,
+      "completions/mean_length": 6628.8359375,
+      "completions/mean_terminated_length": 6473.99267578125,
+      "completions/min_length": 851.0,
+      "completions/min_terminated_length": 851.0,
+      "entropy": 1.0327190533280373,
+      "epoch": 0.35878564857405704,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.00202886201441288,
+      "learning_rate": 1e-05,
+      "loss": 0.0549,
+      "num_tokens": 324648848.0,
+      "reward": 0.421875,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999722242355347,
+      "sampling/importance_sampling_ratio/min": 0.028374243527650833,
+      "sampling/sampling_logp_difference/max": 3.5622735023498535,
+      "sampling/sampling_logp_difference/mean": 0.021120186895132065,
+      "step": 390
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.5018343005031056e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.5018343005031056e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15405.0,
+      "completions/mean_length": 7402.4140625,
+      "completions/mean_terminated_length": 7259.849609375,
+      "completions/min_length": 367.0,
+      "completions/min_terminated_length": 367.0,
+      "entropy": 1.0335597470402718,
+      "epoch": 0.35970561177552896,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0026126320008188486,
+      "learning_rate": 1e-05,
+      "loss": 0.0271,
+      "num_tokens": 325617965.0,
+      "reward": 0.328125,
+      "reward_std": 0.21436560153961182,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000274181365967,
+      "sampling/importance_sampling_ratio/min": 0.002047094516456127,
+      "sampling/sampling_logp_difference/max": 6.191333770751953,
+      "sampling/sampling_logp_difference/mean": 0.021358007565140724,
+      "step": 391
+    },
+    {
+      "clip_ratio/high_max": 1.7713674878905294e-05,
+      "clip_ratio/high_mean": 5.139017389410583e-06,
+      "clip_ratio/low_mean": 4.4972417526878417e-05,
+      "clip_ratio/low_min": 8.263916242867708e-06,
+      "clip_ratio/region_mean": 5.0111435712096863e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16246.0,
+      "completions/mean_length": 7414.046875,
+      "completions/mean_terminated_length": 7124.693359375,
+      "completions/min_length": 467.0,
+      "completions/min_terminated_length": 467.0,
+      "entropy": 1.043906107544899,
+      "epoch": 0.36062557497700093,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004497586749494076,
+      "learning_rate": 1e-05,
+      "loss": 0.0558,
+      "num_tokens": 326583819.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999969005584717,
+      "sampling/importance_sampling_ratio/min": 0.0015032986411824822,
+      "sampling/sampling_logp_difference/max": 6.500093460083008,
+      "sampling/sampling_logp_difference/mean": 0.021614551544189453,
+      "step": 392
+    },
+    {
+      "clip_ratio/high_max": 2.2412414182326756e-05,
+      "clip_ratio/high_mean": 5.603103545581689e-06,
+      "clip_ratio/low_mean": 2.0601042535872693e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.620414619514122e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14288.0,
+      "completions/max_terminated_length": 14288.0,
+      "completions/mean_length": 7090.5,
+      "completions/mean_terminated_length": 7090.5,
+      "completions/min_length": 1183.0,
+      "completions/min_terminated_length": 1183.0,
+      "entropy": 0.9755794927477837,
+      "epoch": 0.36154553817847285,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026554155629128218,
+      "learning_rate": 1e-05,
+      "loss": 0.0468,
+      "num_tokens": 327512315.0,
+      "reward": 0.53125,
+      "reward_std": 0.27722427248954773,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999885618686676,
+      "sampling/importance_sampling_ratio/min": 7.104578980943188e-05,
+      "sampling/sampling_logp_difference/max": 9.552186012268066,
+      "sampling/sampling_logp_difference/mean": 0.020926889032125473,
+      "step": 393
+    },
+    {
+      "clip_ratio/high_max": 3.259367531427415e-06,
+      "clip_ratio/high_mean": 1.5600960523443064e-06,
+      "clip_ratio/low_mean": 3.035687961983058e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.191697578586172e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15710.0,
+      "completions/mean_length": 7200.140625,
+      "completions/mean_terminated_length": 7127.82666015625,
+      "completions/min_length": 80.0,
+      "completions/min_terminated_length": 80.0,
+      "entropy": 0.9084664657711983,
+      "epoch": 0.3624655013799448,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018455780809745193,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 328454269.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2301519364118576,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999072551727295,
+      "sampling/importance_sampling_ratio/min": 0.00033894419902935624,
+      "sampling/sampling_logp_difference/max": 7.989675045013428,
+      "sampling/sampling_logp_difference/mean": 0.01939154416322708,
+      "step": 394
+    },
+    {
+      "clip_ratio/high_max": 1.0260662747896276e-05,
+      "clip_ratio/high_mean": 2.565165686974069e-06,
+      "clip_ratio/low_mean": 3.0616293088314706e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.318145900266245e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 6977.5234375,
+      "completions/mean_terminated_length": 6674.08837890625,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "entropy": 0.9352559298276901,
+      "epoch": 0.36338546458141674,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021165197249501944,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 329366400.0,
+      "reward": 0.4453125,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000401735305786,
+      "sampling/importance_sampling_ratio/min": 0.034073151648044586,
+      "sampling/sampling_logp_difference/max": 3.3792455196380615,
+      "sampling/sampling_logp_difference/mean": 0.020020857453346252,
+      "step": 395
+    },
+    {
+      "clip_ratio/high_max": 2.777207805593207e-05,
+      "clip_ratio/high_mean": 6.9430195139830175e-06,
+      "clip_ratio/low_mean": 4.1006693436429487e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.794971300725592e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16230.0,
+      "completions/mean_length": 7819.828125,
+      "completions/mean_terminated_length": 7398.63916015625,
+      "completions/min_length": 1273.0,
+      "completions/min_terminated_length": 1273.0,
+      "entropy": 1.0045175030827522,
+      "epoch": 0.36430542778288866,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022391832899302244,
+      "learning_rate": 1e-05,
+      "loss": 0.1424,
+      "num_tokens": 330386442.0,
+      "reward": 0.453125,
+      "reward_std": 0.29302334785461426,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999501705169678,
+      "sampling/importance_sampling_ratio/min": 0.0002908352471422404,
+      "sampling/sampling_logp_difference/max": 8.142753601074219,
+      "sampling/sampling_logp_difference/mean": 0.021083837375044823,
+      "step": 396
+    },
+    {
+      "clip_ratio/high_max": 8.042205081437714e-06,
+      "clip_ratio/high_mean": 2.0105512703594286e-06,
+      "clip_ratio/low_mean": 3.623322004386864e-05,
+      "clip_ratio/low_min": 5.5314631026703864e-06,
+      "clip_ratio/region_mean": 3.8243771086854395e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16135.0,
+      "completions/mean_length": 6872.0859375,
+      "completions/mean_terminated_length": 6485.42236328125,
+      "completions/min_length": 609.0,
+      "completions/min_terminated_length": 609.0,
+      "entropy": 0.8501477539539337,
+      "epoch": 0.36522539098436063,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002260354580357671,
+      "learning_rate": 1e-05,
+      "loss": 0.0503,
+      "num_tokens": 331286181.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2280302792787552,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999260902404785,
+      "sampling/importance_sampling_ratio/min": 0.0002785924880299717,
+      "sampling/sampling_logp_difference/max": 8.185760498046875,
+      "sampling/sampling_logp_difference/mean": 0.019428331404924393,
+      "step": 397
+    },
+    {
+      "clip_ratio/high_max": 3.206032488378696e-06,
+      "clip_ratio/high_mean": 8.01508122094674e-07,
+      "clip_ratio/low_mean": 2.8814496317863814e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9616004439958488e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16187.0,
+      "completions/mean_length": 6724.546875,
+      "completions/mean_terminated_length": 6571.22265625,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "entropy": 1.0110125690698624,
+      "epoch": 0.36614535418583255,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001649077981710434,
+      "learning_rate": 1e-05,
+      "loss": 0.0525,
+      "num_tokens": 332166003.0,
+      "reward": 0.421875,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999189376831055,
+      "sampling/importance_sampling_ratio/min": 3.7501690712815616e-06,
+      "sampling/sampling_logp_difference/max": 12.493709564208984,
+      "sampling/sampling_logp_difference/mean": 0.020595930516719818,
+      "step": 398
+    },
+    {
+      "clip_ratio/high_max": 1.11491995085089e-05,
+      "clip_ratio/high_mean": 2.787299877127225e-06,
+      "clip_ratio/low_mean": 3.4109823332073574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.689712332288764e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16096.0,
+      "completions/mean_length": 7833.546875,
+      "completions/mean_terminated_length": 7485.96728515625,
+      "completions/min_length": 1509.0,
+      "completions/min_terminated_length": 1509.0,
+      "entropy": 0.8942571505904198,
+      "epoch": 0.3670653173873045,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0010421582264825702,
+      "learning_rate": 1e-05,
+      "loss": 0.084,
+      "num_tokens": 333188785.0,
+      "reward": 0.328125,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999600648880005,
+      "sampling/importance_sampling_ratio/min": 0.0008163535967469215,
+      "sampling/sampling_logp_difference/max": 7.110662937164307,
+      "sampling/sampling_logp_difference/mean": 0.018777694553136826,
+      "step": 399
+    },
+    {
+      "clip_ratio/high_max": 1.0101967518494348e-05,
+      "clip_ratio/high_mean": 2.525491879623587e-06,
+      "clip_ratio/low_mean": 3.350823226355715e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.603372420002415e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15624.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 7194.96875,
+      "completions/mean_terminated_length": 7194.96875,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "entropy": 1.0446517765522003,
+      "epoch": 0.36798528058877644,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002221160801127553,
+      "learning_rate": 1e-05,
+      "loss": 0.0284,
+      "num_tokens": 334128989.0,
+      "reward": 0.3671875,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954879283905,
+      "sampling/importance_sampling_ratio/min": 8.05134459369583e-06,
+      "sampling/sampling_logp_difference/max": 11.729671478271484,
+      "sampling/sampling_logp_difference/mean": 0.021122492849826813,
+      "step": 400
+    },
+    {
+      "clip_ratio/high_max": 5.990032605041051e-06,
+      "clip_ratio/high_mean": 1.4975081512602628e-06,
+      "clip_ratio/low_mean": 2.5873220806715835e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.737072884428926e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14241.0,
+      "completions/mean_length": 7037.875,
+      "completions/mean_terminated_length": 6657.951171875,
+      "completions/min_length": 810.0,
+      "completions/min_terminated_length": 810.0,
+      "entropy": 0.9549769386649132,
+      "epoch": 0.3689052437902484,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030101474840193987,
+      "learning_rate": 1e-05,
+      "loss": 0.0467,
+      "num_tokens": 335047917.0,
+      "reward": 0.4375,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999676942825317,
+      "sampling/importance_sampling_ratio/min": 2.435619171592407e-05,
+      "sampling/sampling_logp_difference/max": 10.622724533081055,
+      "sampling/sampling_logp_difference/mean": 0.02049148827791214,
+      "step": 401
+    },
+    {
+      "clip_ratio/high_max": 8.082625754468609e-06,
+      "clip_ratio/high_mean": 2.020656438617152e-06,
+      "clip_ratio/low_mean": 3.1645918625144986e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.366657551850949e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 7596.7890625,
+      "completions/mean_terminated_length": 7313.33056640625,
+      "completions/min_length": 285.0,
+      "completions/min_terminated_length": 285.0,
+      "entropy": 0.8307650238275528,
+      "epoch": 0.36982520699172033,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016104152891784906,
+      "learning_rate": 1e-05,
+      "loss": 0.0627,
+      "num_tokens": 336042178.0,
+      "reward": 0.359375,
+      "reward_std": 0.27722427248954773,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999981164932251,
+      "sampling/importance_sampling_ratio/min": 0.007673877757042646,
+      "sampling/sampling_logp_difference/max": 4.869933128356934,
+      "sampling/sampling_logp_difference/mean": 0.019274067133665085,
+      "step": 402
+    },
+    {
+      "clip_ratio/high_max": 5.6481858337065205e-06,
+      "clip_ratio/high_mean": 1.4120464584266301e-06,
+      "clip_ratio/low_mean": 1.32123756202418e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.4624422078668431e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16134.0,
+      "completions/mean_length": 7060.34375,
+      "completions/mean_terminated_length": 6836.576171875,
+      "completions/min_length": 897.0,
+      "completions/min_terminated_length": 897.0,
+      "entropy": 1.0481776595115662,
+      "epoch": 0.37074517019319225,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0010739013087004423,
+      "learning_rate": 1e-05,
+      "loss": 0.0452,
+      "num_tokens": 336963318.0,
+      "reward": 0.328125,
+      "reward_std": 0.1733490228652954,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000027060508728,
+      "sampling/importance_sampling_ratio/min": 0.00011510718468343839,
+      "sampling/sampling_logp_difference/max": 9.069646835327148,
+      "sampling/sampling_logp_difference/mean": 0.02168721705675125,
+      "step": 403
+    },
+    {
+      "clip_ratio/high_max": 3.200204901077086e-06,
+      "clip_ratio/high_mean": 8.000512252692715e-07,
+      "clip_ratio/low_mean": 1.9099150676993304e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9899201902262575e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16357.0,
+      "completions/mean_length": 7746.484375,
+      "completions/mean_terminated_length": 7609.38134765625,
+      "completions/min_length": 960.0,
+      "completions/min_terminated_length": 960.0,
+      "entropy": 1.0216905921697617,
+      "epoch": 0.3716651333946642,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0016449482645839453,
+      "learning_rate": 1e-05,
+      "loss": 0.0255,
+      "num_tokens": 337972068.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 0.0006486645434051752,
+      "sampling/sampling_logp_difference/max": 7.34059476852417,
+      "sampling/sampling_logp_difference/mean": 0.021722178906202316,
+      "step": 404
+    },
+    {
+      "clip_ratio/high_max": 2.161643442377681e-05,
+      "clip_ratio/high_mean": 5.404108605944202e-06,
+      "clip_ratio/low_mean": 4.580058657666086e-05,
+      "clip_ratio/low_min": 4.674994215747574e-06,
+      "clip_ratio/region_mean": 5.120469540997874e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15784.0,
+      "completions/mean_length": 6913.3984375,
+      "completions/mean_terminated_length": 6686.1044921875,
+      "completions/min_length": 843.0,
+      "completions/min_terminated_length": 843.0,
+      "entropy": 0.9993953481316566,
+      "epoch": 0.37258509659613614,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003412841120734811,
+      "learning_rate": 1e-05,
+      "loss": 0.0358,
+      "num_tokens": 338876663.0,
+      "reward": 0.46875,
+      "reward_std": 0.33797895908355713,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999386668205261,
+      "sampling/importance_sampling_ratio/min": 0.00012468472414184362,
+      "sampling/sampling_logp_difference/max": 8.98972225189209,
+      "sampling/sampling_logp_difference/mean": 0.02173588052392006,
+      "step": 405
+    },
+    {
+      "clip_ratio/high_max": 1.074430110747926e-05,
+      "clip_ratio/high_mean": 3.5224193766225653e-06,
+      "clip_ratio/low_mean": 2.64205210100954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9942940273031127e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16192.0,
+      "completions/mean_length": 7588.6953125,
+      "completions/mean_terminated_length": 7377.6083984375,
+      "completions/min_length": 491.0,
+      "completions/min_terminated_length": 491.0,
+      "entropy": 1.1119055226445198,
+      "epoch": 0.3735050597976081,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015972270630300045,
+      "learning_rate": 1e-05,
+      "loss": -0.0047,
+      "num_tokens": 339871184.0,
+      "reward": 0.28125,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999430775642395,
+      "sampling/importance_sampling_ratio/min": 0.00015846571477595717,
+      "sampling/sampling_logp_difference/max": 8.749972343444824,
+      "sampling/sampling_logp_difference/mean": 0.022462764754891396,
+      "step": 406
+    },
+    {
+      "clip_ratio/high_max": 1.2445105085134855e-05,
+      "clip_ratio/high_mean": 3.111276271283714e-06,
+      "clip_ratio/low_mean": 4.525409747202502e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.836537357277848e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16331.0,
+      "completions/max_terminated_length": 16331.0,
+      "completions/mean_length": 6522.4453125,
+      "completions/mean_terminated_length": 6522.4453125,
+      "completions/min_length": 872.0,
+      "completions/min_terminated_length": 872.0,
+      "entropy": 1.0155515000224113,
+      "epoch": 0.37442502299908004,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002284019021317363,
+      "learning_rate": 1e-05,
+      "loss": 0.0607,
+      "num_tokens": 340725769.0,
+      "reward": 0.515625,
+      "reward_std": 0.28749164938926697,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998612999916077,
+      "sampling/importance_sampling_ratio/min": 0.0008916885708458722,
+      "sampling/sampling_logp_difference/max": 7.022393703460693,
+      "sampling/sampling_logp_difference/mean": 0.02157575450837612,
+      "step": 407
+    },
+    {
+      "clip_ratio/high_max": 1.4456319377131877e-05,
+      "clip_ratio/high_mean": 3.614079844282969e-06,
+      "clip_ratio/low_mean": 2.7839718427458138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1453798442271363e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13961.0,
+      "completions/mean_length": 6749.125,
+      "completions/mean_terminated_length": 6517.88818359375,
+      "completions/min_length": 1156.0,
+      "completions/min_terminated_length": 1156.0,
+      "entropy": 1.0721680670976639,
+      "epoch": 0.37534498620055196,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0010391590185463428,
+      "learning_rate": 1e-05,
+      "loss": 0.0622,
+      "num_tokens": 341610881.0,
+      "reward": 0.3828125,
+      "reward_std": 0.1990984082221985,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999426007270813,
+      "sampling/importance_sampling_ratio/min": 0.00020901163225062191,
+      "sampling/sampling_logp_difference/max": 8.47312068939209,
+      "sampling/sampling_logp_difference/mean": 0.02200891077518463,
+      "step": 408
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.4307706237559614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.4307706237559614e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16374.0,
+      "completions/mean_length": 7988.4375,
+      "completions/mean_terminated_length": 7647.154296875,
+      "completions/min_length": 923.0,
+      "completions/min_terminated_length": 923.0,
+      "entropy": 0.9933496564626694,
+      "epoch": 0.37626494940202393,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022965834941715,
+      "learning_rate": 1e-05,
+      "loss": 0.0168,
+      "num_tokens": 342652897.0,
+      "reward": 0.328125,
+      "reward_std": 0.2459382861852646,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999749660491943,
+      "sampling/importance_sampling_ratio/min": 0.0003876982373185456,
+      "sampling/sampling_logp_difference/max": 7.855283260345459,
+      "sampling/sampling_logp_difference/mean": 0.020454837009310722,
+      "step": 409
+    },
+    {
+      "clip_ratio/high_max": 6.58229714645131e-06,
+      "clip_ratio/high_mean": 1.6455742866128276e-06,
+      "clip_ratio/low_mean": 3.983285796493874e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.14784317399608e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15968.0,
+      "completions/mean_length": 7072.4140625,
+      "completions/mean_terminated_length": 6848.9365234375,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "entropy": 0.9560660421848297,
+      "epoch": 0.37718491260349585,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027879721019417048,
+      "learning_rate": 1e-05,
+      "loss": 0.0501,
+      "num_tokens": 343578670.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3043339252471924,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000234842300415,
+      "sampling/importance_sampling_ratio/min": 0.0001181035113404505,
+      "sampling/sampling_logp_difference/max": 9.043949127197266,
+      "sampling/sampling_logp_difference/mean": 0.021169768646359444,
+      "step": 410
+    },
+    {
+      "clip_ratio/high_max": 1.9136705304845236e-05,
+      "clip_ratio/high_mean": 4.784176326211309e-06,
+      "clip_ratio/low_mean": 2.449715702823596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.928133335444727e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15768.0,
+      "completions/mean_length": 6586.515625,
+      "completions/mean_terminated_length": 6270.4677734375,
+      "completions/min_length": 613.0,
+      "completions/min_terminated_length": 613.0,
+      "entropy": 0.893077902495861,
+      "epoch": 0.3781048758049678,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0016929456032812595,
+      "learning_rate": 1e-05,
+      "loss": 0.0457,
+      "num_tokens": 344441080.0,
+      "reward": 0.4765625,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999939799308777,
+      "sampling/importance_sampling_ratio/min": 0.013895876705646515,
+      "sampling/sampling_logp_difference/max": 4.276163101196289,
+      "sampling/sampling_logp_difference/mean": 0.019590143114328384,
+      "step": 411
+    },
+    {
+      "clip_ratio/high_max": 1.2621936093637487e-05,
+      "clip_ratio/high_mean": 3.1554840234093717e-06,
+      "clip_ratio/low_mean": 5.4418370382336434e-05,
+      "clip_ratio/low_min": 1.5258214943969506e-05,
+      "clip_ratio/region_mean": 5.7573854519432643e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15765.0,
+      "completions/mean_length": 7903.296875,
+      "completions/mean_terminated_length": 7629.7255859375,
+      "completions/min_length": 1820.0,
+      "completions/min_terminated_length": 1820.0,
+      "entropy": 0.943502850830555,
+      "epoch": 0.37902483900643974,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024831818882375956,
+      "learning_rate": 1e-05,
+      "loss": 0.0638,
+      "num_tokens": 345472414.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999592900276184,
+      "sampling/importance_sampling_ratio/min": 0.0009350833133794367,
+      "sampling/sampling_logp_difference/max": 6.974874973297119,
+      "sampling/sampling_logp_difference/mean": 0.020601853728294373,
+      "step": 412
+    },
+    {
+      "clip_ratio/high_max": 2.738965622484102e-05,
+      "clip_ratio/high_mean": 9.173523380923143e-06,
+      "clip_ratio/low_mean": 2.9159931841604703e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8333455336214683e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15211.0,
+      "completions/mean_length": 7016.890625,
+      "completions/mean_terminated_length": 6943.1337890625,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "entropy": 0.9670446068048477,
+      "epoch": 0.37994480220791166,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032182165887206793,
+      "learning_rate": 1e-05,
+      "loss": 0.0724,
+      "num_tokens": 346388112.0,
+      "reward": 0.421875,
+      "reward_std": 0.3306122422218323,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999998807907104,
+      "sampling/importance_sampling_ratio/min": 0.000258272688370198,
+      "sampling/sampling_logp_difference/max": 8.261494636535645,
+      "sampling/sampling_logp_difference/mean": 0.020366424694657326,
+      "step": 413
+    },
+    {
+      "clip_ratio/high_max": 6.399099220288917e-06,
+      "clip_ratio/high_mean": 1.5997748050722294e-06,
+      "clip_ratio/low_mean": 2.1530643095957203e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3130417901029432e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16038.0,
+      "completions/mean_length": 7043.8046875,
+      "completions/mean_terminated_length": 6819.64013671875,
+      "completions/min_length": 1331.0,
+      "completions/min_terminated_length": 1331.0,
+      "entropy": 1.022966854274273,
+      "epoch": 0.38086476540938363,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023274575360119343,
+      "learning_rate": 1e-05,
+      "loss": 0.0724,
+      "num_tokens": 347312071.0,
+      "reward": 0.3671875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999589323997498,
+      "sampling/importance_sampling_ratio/min": 0.0013508512638509274,
+      "sampling/sampling_logp_difference/max": 6.607020378112793,
+      "sampling/sampling_logp_difference/mean": 0.021443769335746765,
+      "step": 414
+    },
+    {
+      "clip_ratio/high_max": 1.896051571748103e-05,
+      "clip_ratio/high_mean": 4.7401289293702575e-06,
+      "clip_ratio/low_mean": 2.3596727601216116e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.83368563032127e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14908.0,
+      "completions/mean_length": 6475.6484375,
+      "completions/mean_terminated_length": 6318.37353515625,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "entropy": 0.9873237758874893,
+      "epoch": 0.38178472861085555,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0008460046374239028,
+      "learning_rate": 1e-05,
+      "loss": 0.0639,
+      "num_tokens": 348161394.0,
+      "reward": 0.4375,
+      "reward_std": 0.22620806097984314,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999297261238098,
+      "sampling/importance_sampling_ratio/min": 0.012015017680823803,
+      "sampling/sampling_logp_difference/max": 4.421597957611084,
+      "sampling/sampling_logp_difference/mean": 0.019627809524536133,
+      "step": 415
+    },
+    {
+      "clip_ratio/high_max": 1.9873371229550685e-05,
+      "clip_ratio/high_mean": 4.968342807387671e-06,
+      "clip_ratio/low_mean": 4.485099543671822e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.981933852832299e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16243.0,
+      "completions/mean_length": 8038.90625,
+      "completions/mean_terminated_length": 7699.67431640625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.9513615965843201,
+      "epoch": 0.3827046918123275,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017075197538360953,
+      "learning_rate": 1e-05,
+      "loss": 0.0758,
+      "num_tokens": 349211078.0,
+      "reward": 0.328125,
+      "reward_std": 0.30221715569496155,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000017523765564,
+      "sampling/importance_sampling_ratio/min": 0.0001345122145721689,
+      "sampling/sampling_logp_difference/max": 8.91385555267334,
+      "sampling/sampling_logp_difference/mean": 0.020795777440071106,
+      "step": 416
+    },
+    {
+      "clip_ratio/high_max": 3.976459538534982e-06,
+      "clip_ratio/high_mean": 9.941148846337455e-07,
+      "clip_ratio/low_mean": 4.385826059660758e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.485237468543346e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16304.0,
+      "completions/mean_length": 7339.3046875,
+      "completions/mean_terminated_length": 7122.232421875,
+      "completions/min_length": 1002.0,
+      "completions/min_terminated_length": 1002.0,
+      "entropy": 0.9872350245714188,
+      "epoch": 0.38362465501379944,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016014629509299994,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 350171613.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999157190322876,
+      "sampling/importance_sampling_ratio/min": 1.3763129302901689e-08,
+      "sampling/sampling_logp_difference/max": 18.101272583007812,
+      "sampling/sampling_logp_difference/mean": 0.021187925711274147,
+      "step": 417
+    },
+    {
+      "clip_ratio/high_max": 9.294637948187301e-06,
+      "clip_ratio/high_mean": 2.3236594870468252e-06,
+      "clip_ratio/low_mean": 2.512099752038921e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7444657121122873e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15341.0,
+      "completions/mean_length": 7239.359375,
+      "completions/mean_terminated_length": 7094.20654296875,
+      "completions/min_length": 1294.0,
+      "completions/min_terminated_length": 1294.0,
+      "entropy": 0.9430425837635994,
+      "epoch": 0.3845446182152714,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025373264215886593,
+      "learning_rate": 1e-05,
+      "loss": 0.0038,
+      "num_tokens": 351116803.0,
+      "reward": 0.5234375,
+      "reward_std": 0.24671243131160736,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999785423278809,
+      "sampling/importance_sampling_ratio/min": 0.014291372150182724,
+      "sampling/sampling_logp_difference/max": 4.248099327087402,
+      "sampling/sampling_logp_difference/mean": 0.019912682473659515,
+      "step": 418
+    },
+    {
+      "clip_ratio/high_max": 1.5709408671682468e-05,
+      "clip_ratio/high_mean": 5.310340270625602e-06,
+      "clip_ratio/low_mean": 3.522799016764111e-05,
+      "clip_ratio/low_min": 6.063465662009548e-06,
+      "clip_ratio/region_mean": 4.053833055195355e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15833.0,
+      "completions/mean_length": 7211.7421875,
+      "completions/mean_terminated_length": 7066.1513671875,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "entropy": 0.841051459312439,
+      "epoch": 0.38546458141674333,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002612616401165724,
+      "learning_rate": 1e-05,
+      "loss": 0.1042,
+      "num_tokens": 352059034.0,
+      "reward": 0.625,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999889731407166,
+      "sampling/importance_sampling_ratio/min": 2.5700239802972646e-06,
+      "sampling/sampling_logp_difference/max": 12.87159538269043,
+      "sampling/sampling_logp_difference/mean": 0.01921844482421875,
+      "step": 419
+    },
+    {
+      "clip_ratio/high_max": 7.196444812507252e-06,
+      "clip_ratio/high_mean": 1.799111203126813e-06,
+      "clip_ratio/low_mean": 1.714175300548959e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.894086381071247e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14390.0,
+      "completions/mean_length": 6374.6953125,
+      "completions/mean_terminated_length": 6295.8818359375,
+      "completions/min_length": 693.0,
+      "completions/min_terminated_length": 693.0,
+      "entropy": 1.0578313246369362,
+      "epoch": 0.38638454461821525,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019967984408140182,
+      "learning_rate": 1e-05,
+      "loss": 0.0363,
+      "num_tokens": 352896219.0,
+      "reward": 0.359375,
+      "reward_std": 0.19438526034355164,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999984860420227,
+      "sampling/importance_sampling_ratio/min": 0.020119966939091682,
+      "sampling/sampling_logp_difference/max": 4.295470237731934,
+      "sampling/sampling_logp_difference/mean": 0.02013232931494713,
+      "step": 420
+    },
+    {
+      "clip_ratio/high_max": 3.095712781941984e-05,
+      "clip_ratio/high_mean": 7.73928195485496e-06,
+      "clip_ratio/low_mean": 4.0026389058311906e-05,
+      "clip_ratio/low_min": 8.968050451585441e-06,
+      "clip_ratio/region_mean": 4.7765669989985327e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16313.0,
+      "completions/mean_length": 7274.7109375,
+      "completions/mean_terminated_length": 6667.42529296875,
+      "completions/min_length": 1191.0,
+      "completions/min_terminated_length": 1191.0,
+      "entropy": 0.7415856420993805,
+      "epoch": 0.3873045078196872,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018279170617461205,
+      "learning_rate": 1e-05,
+      "loss": 0.067,
+      "num_tokens": 353844990.0,
+      "reward": 0.53125,
+      "reward_std": 0.29696235060691833,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998984336853027,
+      "sampling/importance_sampling_ratio/min": 0.00019450874242465943,
+      "sampling/sampling_logp_difference/max": 8.54503345489502,
+      "sampling/sampling_logp_difference/mean": 0.017373956739902496,
+      "step": 421
+    },
+    {
+      "clip_ratio/high_max": 1.3592496998171555e-05,
+      "clip_ratio/high_mean": 3.3981242495428887e-06,
+      "clip_ratio/low_mean": 4.277909783922951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6177221065590857e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15676.0,
+      "completions/mean_length": 7902.9296875,
+      "completions/mean_terminated_length": 7836.1494140625,
+      "completions/min_length": 339.0,
+      "completions/min_terminated_length": 339.0,
+      "entropy": 1.0019611343741417,
+      "epoch": 0.38822447102115915,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001777544734068215,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 354873933.0,
+      "reward": 0.3125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999098777770996,
+      "sampling/importance_sampling_ratio/min": 0.001003989833407104,
+      "sampling/sampling_logp_difference/max": 6.903773307800293,
+      "sampling/sampling_logp_difference/mean": 0.021197015419602394,
+      "step": 422
+    },
+    {
+      "clip_ratio/high_max": 2.524704336792638e-05,
+      "clip_ratio/high_mean": 7.122522617919458e-06,
+      "clip_ratio/low_mean": 2.635721989463491e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.347974279677146e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14700.0,
+      "completions/mean_length": 7304.046875,
+      "completions/mean_terminated_length": 6617.328125,
+      "completions/min_length": 487.0,
+      "completions/min_terminated_length": 487.0,
+      "entropy": 0.8584602400660515,
+      "epoch": 0.3891444342226311,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00153827341273427,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 355829507.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2982654273509979,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999763369560242,
+      "sampling/importance_sampling_ratio/min": 3.820072379312478e-05,
+      "sampling/sampling_logp_difference/max": 10.172656059265137,
+      "sampling/sampling_logp_difference/mean": 0.019642215222120285,
+      "step": 423
+    },
+    {
+      "clip_ratio/high_max": 5.025731752539286e-06,
+      "clip_ratio/high_mean": 1.2564329381348216e-06,
+      "clip_ratio/low_mean": 3.204466929673799e-05,
+      "clip_ratio/low_min": 3.388819550309563e-06,
+      "clip_ratio/region_mean": 3.330110212118598e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15074.0,
+      "completions/mean_length": 5677.21875,
+      "completions/mean_terminated_length": 5507.27001953125,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "entropy": 1.0159753635525703,
+      "epoch": 0.39006439742410304,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002107275417074561,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 356573231.0,
+      "reward": 0.5,
+      "reward_std": 0.25354719161987305,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999579191207886,
+      "sampling/importance_sampling_ratio/min": 0.0019436449510976672,
+      "sampling/sampling_logp_difference/max": 6.243190288543701,
+      "sampling/sampling_logp_difference/mean": 0.020722679793834686,
+      "step": 424
+    },
+    {
+      "clip_ratio/high_max": 1.4743651718163164e-05,
+      "clip_ratio/high_mean": 3.685912929540791e-06,
+      "clip_ratio/low_mean": 1.6582721229951858e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0268634500553162e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15407.0,
+      "completions/max_terminated_length": 15407.0,
+      "completions/mean_length": 6209.078125,
+      "completions/mean_terminated_length": 6209.078125,
+      "completions/min_length": 723.0,
+      "completions/min_terminated_length": 723.0,
+      "entropy": 0.8867508247494698,
+      "epoch": 0.39098436062557496,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001720887958072126,
+      "learning_rate": 1e-05,
+      "loss": 0.0828,
+      "num_tokens": 357387169.0,
+      "reward": 0.5703125,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000000238418579,
+      "sampling/importance_sampling_ratio/min": 4.222915777063463e-06,
+      "sampling/sampling_logp_difference/max": 12.374984741210938,
+      "sampling/sampling_logp_difference/mean": 0.017990771681070328,
+      "step": 425
+    },
+    {
+      "clip_ratio/high_max": 7.870049557823222e-06,
+      "clip_ratio/high_mean": 1.9675123894558055e-06,
+      "clip_ratio/low_mean": 1.6993449889923795e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.89609622793796e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15845.0,
+      "completions/mean_length": 7183.3671875,
+      "completions/mean_terminated_length": 6962.55224609375,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "entropy": 0.9918289259076118,
+      "epoch": 0.39190432382704693,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0012448625639081001,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 358334584.0,
+      "reward": 0.328125,
+      "reward_std": 0.17464719712734222,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999426007270813,
+      "sampling/importance_sampling_ratio/min": 0.00038028976996429265,
+      "sampling/sampling_logp_difference/max": 7.874577045440674,
+      "sampling/sampling_logp_difference/mean": 0.020646382123231888,
+      "step": 426
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9313079608073167e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9313079608073167e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15106.0,
+      "completions/mean_length": 6877.1484375,
+      "completions/mean_terminated_length": 6802.29150390625,
+      "completions/min_length": 2027.0,
+      "completions/min_terminated_length": 2027.0,
+      "entropy": 0.8806835636496544,
+      "epoch": 0.39282428702851885,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.001519464422017336,
+      "learning_rate": 1e-05,
+      "loss": 0.0686,
+      "num_tokens": 359233451.0,
+      "reward": 0.375,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998905658721924,
+      "sampling/importance_sampling_ratio/min": 0.008662254549562931,
+      "sampling/sampling_logp_difference/max": 4.748780250549316,
+      "sampling/sampling_logp_difference/mean": 0.01951739378273487,
+      "step": 427
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.164141705587099e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.164141705587099e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16061.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 6964.6875,
+      "completions/mean_terminated_length": 6964.6875,
+      "completions/min_length": 1148.0,
+      "completions/min_terminated_length": 1148.0,
+      "entropy": 0.8069597631692886,
+      "epoch": 0.3937442502299908,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022954042069613934,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 360143003.0,
+      "reward": 0.53125,
+      "reward_std": 0.3253750801086426,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999755620956421,
+      "sampling/importance_sampling_ratio/min": 0.00020347768440842628,
+      "sampling/sampling_logp_difference/max": 8.499954223632812,
+      "sampling/sampling_logp_difference/mean": 0.01880607008934021,
+      "step": 428
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7779158497432945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7779158497432945e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16123.0,
+      "completions/mean_length": 7322.0,
+      "completions/mean_terminated_length": 7178.1591796875,
+      "completions/min_length": 901.0,
+      "completions/min_terminated_length": 901.0,
+      "entropy": 1.0852478593587875,
+      "epoch": 0.39466421343146274,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0021376016084104776,
+      "learning_rate": 1e-05,
+      "loss": 0.0094,
+      "num_tokens": 361101379.0,
+      "reward": 0.3046875,
+      "reward_std": 0.15308690071105957,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000006079673767,
+      "sampling/importance_sampling_ratio/min": 0.00011516757513163611,
+      "sampling/sampling_logp_difference/max": 9.069122314453125,
+      "sampling/sampling_logp_difference/mean": 0.021568164229393005,
+      "step": 429
+    },
+    {
+      "clip_ratio/high_max": 3.1260904052032856e-05,
+      "clip_ratio/high_mean": 8.905177651286067e-06,
+      "clip_ratio/low_mean": 4.4633561628870666e-05,
+      "clip_ratio/low_min": 4.338168764661532e-06,
+      "clip_ratio/region_mean": 5.353873848434887e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15880.0,
+      "completions/mean_length": 7027.0078125,
+      "completions/mean_terminated_length": 6646.64208984375,
+      "completions/min_length": 967.0,
+      "completions/min_terminated_length": 967.0,
+      "entropy": 0.8932972475886345,
+      "epoch": 0.39558417663293466,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0031003563199192286,
+      "learning_rate": 1e-05,
+      "loss": 0.0875,
+      "num_tokens": 362018284.0,
+      "reward": 0.5,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999800324440002,
+      "sampling/importance_sampling_ratio/min": 0.0010351726086810231,
+      "sampling/sampling_logp_difference/max": 6.873187065124512,
+      "sampling/sampling_logp_difference/mean": 0.020102323964238167,
+      "step": 430
+    },
+    {
+      "clip_ratio/high_max": 1.5146189525694354e-05,
+      "clip_ratio/high_mean": 4.871089572588971e-06,
+      "clip_ratio/low_mean": 4.263560651907028e-05,
+      "clip_ratio/low_min": 1.8708525658439612e-05,
+      "clip_ratio/region_mean": 4.7506695409538224e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15430.0,
+      "completions/mean_length": 6341.7421875,
+      "completions/mean_terminated_length": 6262.66943359375,
+      "completions/min_length": 939.0,
+      "completions/min_terminated_length": 939.0,
+      "entropy": 0.885854922235012,
+      "epoch": 0.39650413983440663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018177316524088383,
+      "learning_rate": 1e-05,
+      "loss": 0.0701,
+      "num_tokens": 362851107.0,
+      "reward": 0.5234375,
+      "reward_std": 0.28171277046203613,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999906063079834,
+      "sampling/importance_sampling_ratio/min": 0.0005522234132513404,
+      "sampling/sampling_logp_difference/max": 7.50155782699585,
+      "sampling/sampling_logp_difference/mean": 0.020463842898607254,
+      "step": 431
+    },
+    {
+      "clip_ratio/high_max": 1.9989562133559957e-05,
+      "clip_ratio/high_mean": 5.9246351611363934e-06,
+      "clip_ratio/low_mean": 3.242748857701372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.835212419289746e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12672.0,
+      "completions/mean_length": 6388.875,
+      "completions/mean_terminated_length": 6310.17333984375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "entropy": 0.9593783840537071,
+      "epoch": 0.39742410303587855,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001520519028417766,
+      "learning_rate": 1e-05,
+      "loss": 0.0503,
+      "num_tokens": 363691019.0,
+      "reward": 0.328125,
+      "reward_std": 0.2972046136856079,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000056028366089,
+      "sampling/importance_sampling_ratio/min": 0.0011127673787996173,
+      "sampling/sampling_logp_difference/max": 6.800905227661133,
+      "sampling/sampling_logp_difference/mean": 0.019675832241773605,
+      "step": 432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4561562668168335e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4561562668168335e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15240.0,
+      "completions/mean_length": 7559.875,
+      "completions/mean_terminated_length": 7125.9013671875,
+      "completions/min_length": 1292.0,
+      "completions/min_terminated_length": 1292.0,
+      "entropy": 0.8298296853899956,
+      "epoch": 0.3983440662373505,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016424815403297544,
+      "learning_rate": 1e-05,
+      "loss": 0.026,
+      "num_tokens": 364679475.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2409384697675705,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000078678131104,
+      "sampling/importance_sampling_ratio/min": 0.00026268011424690485,
+      "sampling/sampling_logp_difference/max": 8.244573593139648,
+      "sampling/sampling_logp_difference/mean": 0.01943236216902733,
+      "step": 433
+    },
+    {
+      "clip_ratio/high_max": 9.62110971158836e-06,
+      "clip_ratio/high_mean": 2.40527742789709e-06,
+      "clip_ratio/low_mean": 3.785217859331169e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.025745568014827e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16162.0,
+      "completions/mean_length": 5993.1328125,
+      "completions/mean_terminated_length": 5743.75244140625,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.9428447484970093,
+      "epoch": 0.39926402943882244,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020216500852257013,
+      "learning_rate": 1e-05,
+      "loss": 0.0383,
+      "num_tokens": 365464588.0,
+      "reward": 0.5078125,
+      "reward_std": 0.28353995084762573,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999315142631531,
+      "sampling/importance_sampling_ratio/min": 0.0006411138456314802,
+      "sampling/sampling_logp_difference/max": 7.352303504943848,
+      "sampling/sampling_logp_difference/mean": 0.0196966715157032,
+      "step": 434
+    },
+    {
+      "clip_ratio/high_max": 1.3527967894333415e-05,
+      "clip_ratio/high_mean": 3.3819919735833537e-06,
+      "clip_ratio/low_mean": 2.5303937945864163e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8685930146821192e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15118.0,
+      "completions/mean_length": 5325.0390625,
+      "completions/mean_terminated_length": 5149.50048828125,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "entropy": 0.7730643972754478,
+      "epoch": 0.40018399264029436,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017544744769111276,
+      "learning_rate": 1e-05,
+      "loss": 0.0816,
+      "num_tokens": 366167481.0,
+      "reward": 0.671875,
+      "reward_std": 0.30091896653175354,
+      "rewards/accuracy_reward/mean": 0.671875,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000058889389038,
+      "sampling/importance_sampling_ratio/min": 0.0008072834461927414,
+      "sampling/sampling_logp_difference/max": 7.121835708618164,
+      "sampling/sampling_logp_difference/mean": 0.01736798696219921,
+      "step": 435
+    },
+    {
+      "clip_ratio/high_max": 8.82370454746706e-06,
+      "clip_ratio/high_mean": 3.1566120810566645e-06,
+      "clip_ratio/low_mean": 2.7905126785299217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1061739150572976e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 7331.3359375,
+      "completions/mean_terminated_length": 7114.072265625,
+      "completions/min_length": 1160.0,
+      "completions/min_terminated_length": 1160.0,
+      "entropy": 0.9418040588498116,
+      "epoch": 0.40110395584176634,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0013123912503942847,
+      "learning_rate": 1e-05,
+      "loss": 0.0445,
+      "num_tokens": 367126948.0,
+      "reward": 0.453125,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999157786369324,
+      "sampling/importance_sampling_ratio/min": 0.0023285788483917713,
+      "sampling/sampling_logp_difference/max": 6.062497138977051,
+      "sampling/sampling_logp_difference/mean": 0.020918458700180054,
+      "step": 436
+    },
+    {
+      "clip_ratio/high_max": 1.6637134194752434e-05,
+      "clip_ratio/high_mean": 4.1592835486881086e-06,
+      "clip_ratio/low_mean": 4.105965246026244e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.521893566789004e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7217.25,
+      "completions/mean_terminated_length": 6686.94189453125,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.9499563127756119,
+      "epoch": 0.40202391904323825,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0021181178744882345,
+      "learning_rate": 1e-05,
+      "loss": 0.0656,
+      "num_tokens": 368071772.0,
+      "reward": 0.453125,
+      "reward_std": 0.3593195080757141,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998823404312134,
+      "sampling/importance_sampling_ratio/min": 0.000675773830153048,
+      "sampling/sampling_logp_difference/max": 7.299652099609375,
+      "sampling/sampling_logp_difference/mean": 0.020650038495659828,
+      "step": 437
+    },
+    {
+      "clip_ratio/high_max": 1.2043050901411334e-05,
+      "clip_ratio/high_mean": 3.0107627253528335e-06,
+      "clip_ratio/low_mean": 3.4911336570075946e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.792209963648929e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15669.0,
+      "completions/mean_length": 7549.796875,
+      "completions/mean_terminated_length": 7264.822265625,
+      "completions/min_length": 1037.0,
+      "completions/min_terminated_length": 1037.0,
+      "entropy": 1.0309365764260292,
+      "epoch": 0.4029438822447102,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019147706916555762,
+      "learning_rate": 1e-05,
+      "loss": 0.0159,
+      "num_tokens": 369055650.0,
+      "reward": 0.359375,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999600648880005,
+      "sampling/importance_sampling_ratio/min": 0.0010221411939710379,
+      "sampling/sampling_logp_difference/max": 6.885855674743652,
+      "sampling/sampling_logp_difference/mean": 0.02183394506573677,
+      "step": 438
+    },
+    {
+      "clip_ratio/high_max": 3.9433421079593245e-06,
+      "clip_ratio/high_mean": 9.858355269898311e-07,
+      "clip_ratio/low_mean": 3.9529069113086734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.051490452638973e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15269.0,
+      "completions/mean_length": 6727.53125,
+      "completions/mean_terminated_length": 6651.49609375,
+      "completions/min_length": 1157.0,
+      "completions/min_terminated_length": 1157.0,
+      "entropy": 0.9676288217306137,
+      "epoch": 0.40386384544618215,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0031485585495829582,
+      "learning_rate": 1e-05,
+      "loss": 0.095,
+      "num_tokens": 369938574.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3306073546409607,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000512599945068,
+      "sampling/importance_sampling_ratio/min": 0.000710509717464447,
+      "sampling/sampling_logp_difference/max": 7.249527931213379,
+      "sampling/sampling_logp_difference/mean": 0.020127974450588226,
+      "step": 439
+    },
+    {
+      "clip_ratio/high_max": 1.0043262818726362e-05,
+      "clip_ratio/high_mean": 2.5108157046815904e-06,
+      "clip_ratio/low_mean": 3.8503443363424594e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.101425872704567e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15864.0,
+      "completions/mean_length": 6281.7265625,
+      "completions/mean_terminated_length": 5955.8466796875,
+      "completions/min_length": 672.0,
+      "completions/min_terminated_length": 672.0,
+      "entropy": 0.9817835092544556,
+      "epoch": 0.4047838086476541,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003415121464058757,
+      "learning_rate": 1e-05,
+      "loss": 0.0332,
+      "num_tokens": 370760459.0,
+      "reward": 0.375,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999826550483704,
+      "sampling/importance_sampling_ratio/min": 4.502153956309485e-07,
+      "sampling/sampling_logp_difference/max": 14.613539695739746,
+      "sampling/sampling_logp_difference/mean": 0.02063862606883049,
+      "step": 440
+    },
+    {
+      "clip_ratio/high_max": 2.3593061087012757e-05,
+      "clip_ratio/high_mean": 7.003677183092805e-06,
+      "clip_ratio/low_mean": 1.8947657395074202e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5951335032914358e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6765.71875,
+      "completions/mean_terminated_length": 6689.984375,
+      "completions/min_length": 1444.0,
+      "completions/min_terminated_length": 1444.0,
+      "entropy": 1.0270514711737633,
+      "epoch": 0.40570377184912604,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00214037811383605,
+      "learning_rate": 1e-05,
+      "loss": 0.0366,
+      "num_tokens": 371649103.0,
+      "reward": 0.4765625,
+      "reward_std": 0.1830746978521347,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999620318412781,
+      "sampling/importance_sampling_ratio/min": 0.001930466154590249,
+      "sampling/sampling_logp_difference/max": 6.249993801116943,
+      "sampling/sampling_logp_difference/mean": 0.02172943949699402,
+      "step": 441
+    },
+    {
+      "clip_ratio/high_max": 2.1009727788623422e-05,
+      "clip_ratio/high_mean": 6.259035217226483e-06,
+      "clip_ratio/low_mean": 5.011202529203729e-05,
+      "clip_ratio/low_min": 3.1568047234031837e-06,
+      "clip_ratio/region_mean": 5.637106050926377e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16159.0,
+      "completions/mean_length": 7481.625,
+      "completions/mean_terminated_length": 6966.611328125,
+      "completions/min_length": 782.0,
+      "completions/min_terminated_length": 782.0,
+      "entropy": 0.9730701074004173,
+      "epoch": 0.40662373505059796,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003510556183755398,
+      "learning_rate": 1e-05,
+      "loss": 0.0022,
+      "num_tokens": 372624535.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3464162349700928,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492168426514,
+      "sampling/importance_sampling_ratio/min": 0.0003729084855876863,
+      "sampling/sampling_logp_difference/max": 7.894177436828613,
+      "sampling/sampling_logp_difference/mean": 0.02149931713938713,
+      "step": 442
+    },
+    {
+      "clip_ratio/high_max": 2.8992230909352656e-06,
+      "clip_ratio/high_mean": 7.248057727338164e-07,
+      "clip_ratio/low_mean": 3.781230475397024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.853711018564354e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15994.0,
+      "completions/mean_length": 8474.2109375,
+      "completions/mean_terminated_length": 8152.67431640625,
+      "completions/min_length": 983.0,
+      "completions/min_terminated_length": 983.0,
+      "entropy": 0.9761426225304604,
+      "epoch": 0.40754369825206993,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018307552672922611,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 373732962.0,
+      "reward": 0.328125,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999135732650757,
+      "sampling/importance_sampling_ratio/min": 1.6381112288854638e-07,
+      "sampling/sampling_logp_difference/max": 15.624551773071289,
+      "sampling/sampling_logp_difference/mean": 0.02121492102742195,
+      "step": 443
+    },
+    {
+      "clip_ratio/high_max": 7.689794983889442e-06,
+      "clip_ratio/high_mean": 1.9224487459723605e-06,
+      "clip_ratio/low_mean": 4.332422963670979e-05,
+      "clip_ratio/low_min": 5.504910404852126e-06,
+      "clip_ratio/region_mean": 4.5246677473187447e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15117.0,
+      "completions/max_terminated_length": 15117.0,
+      "completions/mean_length": 7433.953125,
+      "completions/mean_terminated_length": 7433.953125,
+      "completions/min_length": 1108.0,
+      "completions/min_terminated_length": 1108.0,
+      "entropy": 1.0665365010499954,
+      "epoch": 0.40846366145354185,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021801323164254427,
+      "learning_rate": 1e-05,
+      "loss": -0.0046,
+      "num_tokens": 374706548.0,
+      "reward": 0.421875,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999595880508423,
+      "sampling/importance_sampling_ratio/min": 1.2762369294705422e-07,
+      "sampling/sampling_logp_difference/max": 15.87417984008789,
+      "sampling/sampling_logp_difference/mean": 0.022046178579330444,
+      "step": 444
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7811285235657124e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7811285235657124e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15989.0,
+      "completions/mean_length": 7646.7265625,
+      "completions/mean_terminated_length": 7217.0244140625,
+      "completions/min_length": 1019.0,
+      "completions/min_terminated_length": 1019.0,
+      "entropy": 0.9308071210980415,
+      "epoch": 0.4093836246550138,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014674996491521597,
+      "learning_rate": 1e-05,
+      "loss": 0.0225,
+      "num_tokens": 375706673.0,
+      "reward": 0.328125,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000466108322144,
+      "sampling/importance_sampling_ratio/min": 2.4991354621306527e-06,
+      "sampling/sampling_logp_difference/max": 12.899565696716309,
+      "sampling/sampling_logp_difference/mean": 0.018912145867943764,
+      "step": 445
+    },
+    {
+      "clip_ratio/high_max": 2.569714251876576e-05,
+      "clip_ratio/high_mean": 6.42428562969144e-06,
+      "clip_ratio/low_mean": 2.5548037910994026e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1972323540685466e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15082.0,
+      "completions/mean_length": 6046.46875,
+      "completions/mean_terminated_length": 5965.07080078125,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "entropy": 0.9040833190083504,
+      "epoch": 0.41030358785648574,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002320521976798773,
+      "learning_rate": 1e-05,
+      "loss": 0.0489,
+      "num_tokens": 376506613.0,
+      "reward": 0.515625,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999364614486694,
+      "sampling/importance_sampling_ratio/min": 2.462414704496041e-05,
+      "sampling/sampling_logp_difference/max": 10.611783027648926,
+      "sampling/sampling_logp_difference/mean": 0.018557455390691757,
+      "step": 446
+    },
+    {
+      "clip_ratio/high_max": 1.3460261698128306e-05,
+      "clip_ratio/high_mean": 4.301844171550329e-06,
+      "clip_ratio/low_mean": 5.543450777167891e-05,
+      "clip_ratio/low_min": 1.7309802160525578e-05,
+      "clip_ratio/region_mean": 5.973635086320428e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 7435.53125,
+      "completions/mean_terminated_length": 7220.7685546875,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "entropy": 1.0237125977873802,
+      "epoch": 0.41122355105795766,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004084885586053133,
+      "learning_rate": 1e-05,
+      "loss": 0.1085,
+      "num_tokens": 377476249.0,
+      "reward": 0.3984375,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999234676361084,
+      "sampling/importance_sampling_ratio/min": 0.0008572525111958385,
+      "sampling/sampling_logp_difference/max": 7.0617780685424805,
+      "sampling/sampling_logp_difference/mean": 0.02096719481050968,
+      "step": 447
+    },
+    {
+      "clip_ratio/high_max": 2.7470227905723732e-05,
+      "clip_ratio/high_mean": 8.26576740564633e-06,
+      "clip_ratio/low_mean": 3.730497360265872e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.557074043987086e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16351.0,
+      "completions/mean_length": 7228.1953125,
+      "completions/mean_terminated_length": 7008.45654296875,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "entropy": 0.9323876351118088,
+      "epoch": 0.41214351425942963,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025359690189361572,
+      "learning_rate": 1e-05,
+      "loss": 0.1291,
+      "num_tokens": 378423338.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3182457685470581,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000035762786865,
+      "sampling/importance_sampling_ratio/min": 0.00026116587105207145,
+      "sampling/sampling_logp_difference/max": 8.250354766845703,
+      "sampling/sampling_logp_difference/mean": 0.020272942259907722,
+      "step": 448
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 378423338,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-448/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-448/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# Diagnostics switch: any truthy value enables the ``if debug:`` prints in this script.
+debug = 0
+
+# load to cpu: passed as ``map_location`` to the ``torch.load`` calls in the parse_* helpers
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/README.md b/dapo_lora_plus_20251202_001141/checkpoint-64/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-64/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-64/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-64/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-64/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-64/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/latest b/dapo_lora_plus_20251202_001141/checkpoint-64/latest
new file mode 100644
index 0000000000000000000000000000000000000000..4a12e7f9029554e8e5ce68ebe3e97d0b4e734304
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-64/latest
@@ -0,0 +1 @@
+global_step64
\ No newline at end of file
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-64/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-64/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/tokenizer_config.json b/dapo_lora_plus_20251202_001141/checkpoint-64/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-64/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-64/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c211faa74990af61cf4d03795dd1b2c15f6e5375
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-64/trainer_state.json
@@ -0,0 +1,2018 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.05887764489420423,
+  "eval_steps": 500,
+  "global_step": 64,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025745572056621313,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 5.499582130141789e-06,
+      "clip_ratio/high_mean": 1.3748955325354473e-06,
+      "clip_ratio/low_mean": 2.871888784738985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009378326623846e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 4767.1875,
+      "completions/mean_terminated_length": 4767.1875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.088237851858139,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002068034838885069,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 1425798.0,
+      "reward": 0.3046875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 0.01811397261917591,
+      "sampling/sampling_logp_difference/max": 4.011071681976318,
+      "sampling/sampling_logp_difference/mean": 0.01877593621611595,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.459846724103045e-05,
+      "clip_ratio/low_min": 3.4060874440910993e-06,
+      "clip_ratio/region_mean": 4.459846724103045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16317.0,
+      "completions/mean_length": 6586.359375,
+      "completions/mean_terminated_length": 6351.21630859375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0497623533010483,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001971944235265255,
+      "learning_rate": 1e-05,
+      "loss": 0.0199,
+      "num_tokens": 2287420.0,
+      "reward": 0.28125,
+      "reward_std": 0.29143062233924866,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999316334724426,
+      "sampling/importance_sampling_ratio/min": 5.356698966352269e-05,
+      "sampling/sampling_logp_difference/max": 9.834577560424805,
+      "sampling/sampling_logp_difference/mean": 0.02137824520468712,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.7640652004047297e-05,
+      "clip_ratio/high_mean": 5.48578327652649e-06,
+      "clip_ratio/low_mean": 3.218628648937738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.767206976590387e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14690.0,
+      "completions/max_terminated_length": 14690.0,
+      "completions/mean_length": 5448.0234375,
+      "completions/mean_terminated_length": 5448.0234375,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 1.1134418621659279,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016465173102915287,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 3009167.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27958330512046814,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 7.889385415182915e-06,
+      "sampling/sampling_logp_difference/max": 11.749992370605469,
+      "sampling/sampling_logp_difference/mean": 0.020580951124429703,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 1.3439519989333348e-05,
+      "clip_ratio/high_mean": 3.359879997333337e-06,
+      "clip_ratio/low_mean": 2.8849915906903334e-05,
+      "clip_ratio/low_min": 8.467687621305231e-06,
+      "clip_ratio/region_mean": 3.220979442630778e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13420.0,
+      "completions/mean_length": 5436.8671875,
+      "completions/mean_terminated_length": 5350.66943359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 1.1473859176039696,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023770295083522797,
+      "learning_rate": 1e-05,
+      "loss": 0.0153,
+      "num_tokens": 3725654.0,
+      "reward": 0.2734375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99991774559021,
+      "sampling/importance_sampling_ratio/min": 0.0011146117467433214,
+      "sampling/sampling_logp_difference/max": 6.799249172210693,
+      "sampling/sampling_logp_difference/mean": 0.020377254113554955,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 4.652201369026443e-06,
+      "clip_ratio/high_mean": 1.1630503422566107e-06,
+      "clip_ratio/low_mean": 2.8399212624208303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9562263534899103e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14440.0,
+      "completions/max_terminated_length": 14440.0,
+      "completions/mean_length": 4697.5390625,
+      "completions/mean_terminated_length": 4697.5390625,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.0097229778766632,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003342699259519577,
+      "learning_rate": 1e-05,
+      "loss": 0.0326,
+      "num_tokens": 4345547.0,
+      "reward": 0.390625,
+      "reward_std": 0.34480881690979004,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914765357971,
+      "sampling/importance_sampling_ratio/min": 0.002385853324085474,
+      "sampling/sampling_logp_difference/max": 6.038198471069336,
+      "sampling/sampling_logp_difference/mean": 0.0185473021119833,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 9.362594937556423e-06,
+      "clip_ratio/high_mean": 2.340648734389106e-06,
+      "clip_ratio/low_mean": 6.054362825125281e-05,
+      "clip_ratio/low_min": 7.427356649714056e-06,
+      "clip_ratio/region_mean": 6.288427744038927e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14652.0,
+      "completions/mean_length": 6218.2109375,
+      "completions/mean_terminated_length": 5890.2822265625,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 1.0579778030514717,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002073560608550906,
+      "learning_rate": 1e-05,
+      "loss": 0.0201,
+      "num_tokens": 5160646.0,
+      "reward": 0.2109375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 0.00044544730917550623,
+      "sampling/sampling_logp_difference/max": 7.716431617736816,
+      "sampling/sampling_logp_difference/mean": 0.020321575924754143,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 1.1064067621191498e-05,
+      "clip_ratio/high_mean": 2.7660169052978745e-06,
+      "clip_ratio/low_mean": 2.2175867059104348e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4941883737028547e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13637.0,
+      "completions/mean_length": 5127.8359375,
+      "completions/mean_terminated_length": 5039.20458984375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.0472618415951729,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032994600478559732,
+      "learning_rate": 1e-05,
+      "loss": 0.0751,
+      "num_tokens": 5836289.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999483227729797,
+      "sampling/importance_sampling_ratio/min": 0.0013780994340777397,
+      "sampling/sampling_logp_difference/max": 6.587049961090088,
+      "sampling/sampling_logp_difference/mean": 0.01940803974866867,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 1.2357884770608507e-05,
+      "clip_ratio/high_mean": 3.0894711926521268e-06,
+      "clip_ratio/low_mean": 3.000627111759968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.309574231025181e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15916.0,
+      "completions/mean_length": 4516.890625,
+      "completions/mean_terminated_length": 4423.44873046875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "entropy": 0.911251038312912,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003016560571268201,
+      "learning_rate": 1e-05,
+      "loss": 0.1006,
+      "num_tokens": 6433171.0,
+      "reward": 0.390625,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999179840087891,
+      "sampling/importance_sampling_ratio/min": 0.005480794236063957,
+      "sampling/sampling_logp_difference/max": 5.206505298614502,
+      "sampling/sampling_logp_difference/mean": 0.017437148839235306,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 4.6329013457580004e-05,
+      "clip_ratio/high_mean": 1.1582253364395001e-05,
+      "clip_ratio/low_mean": 7.069455705277505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.227681109929108e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13970.0,
+      "completions/mean_length": 4961.453125,
+      "completions/mean_terminated_length": 4687.31201171875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "entropy": 0.6808596402406693,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0035386616364121437,
+      "learning_rate": 1e-05,
+      "loss": 0.0596,
+      "num_tokens": 7085389.0,
+      "reward": 0.5625,
+      "reward_std": 0.3816363215446472,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0002734088629949838,
+      "sampling/sampling_logp_difference/max": 8.20454216003418,
+      "sampling/sampling_logp_difference/mean": 0.01566406339406967,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 2.43190661421977e-05,
+      "clip_ratio/high_mean": 6.079766535549425e-06,
+      "clip_ratio/low_mean": 2.2395396172214532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8475162707763957e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14776.0,
+      "completions/mean_length": 4429.40625,
+      "completions/mean_terminated_length": 4335.275390625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9181502386927605,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0022535293828696012,
+      "learning_rate": 1e-05,
+      "loss": 0.0031,
+      "num_tokens": 7672185.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20357418060302734,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998801946640015,
+      "sampling/importance_sampling_ratio/min": 5.315856554943821e-08,
+      "sampling/sampling_logp_difference/max": 16.74998664855957,
+      "sampling/sampling_logp_difference/mean": 0.018429335206747055,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 1.0117325928149512e-05,
+      "clip_ratio/high_mean": 2.529331482037378e-06,
+      "clip_ratio/low_mean": 1.1982813475697185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.45121450714214e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14029.0,
+      "completions/mean_length": 5282.6796875,
+      "completions/mean_terminated_length": 5106.46875,
+      "completions/min_length": 323.0,
+      "completions/min_terminated_length": 323.0,
+      "entropy": 1.113751620054245,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013591813622042537,
+      "learning_rate": 1e-05,
+      "loss": 0.0971,
+      "num_tokens": 8369000.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3029736578464508,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998897314071655,
+      "sampling/importance_sampling_ratio/min": 3.970265970565379e-05,
+      "sampling/sampling_logp_difference/max": 10.134092330932617,
+      "sampling/sampling_logp_difference/mean": 0.020221836864948273,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 5.411958227341529e-06,
+      "clip_ratio/high_mean": 1.3529895568353822e-06,
+      "clip_ratio/low_mean": 2.5284593846208736e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6637583516730956e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15925.0,
+      "completions/mean_length": 6970.421875,
+      "completions/mean_terminated_length": 6744.49609375,
+      "completions/min_length": 53.0,
+      "completions/min_terminated_length": 53.0,
+      "entropy": 1.1721933633089066,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024079051800072193,
+      "learning_rate": 1e-05,
+      "loss": 0.0713,
+      "num_tokens": 9283182.0,
+      "reward": 0.171875,
+      "reward_std": 0.17965975403785706,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999163746833801,
+      "sampling/importance_sampling_ratio/min": 0.0008915197686292231,
+      "sampling/sampling_logp_difference/max": 7.0225830078125,
+      "sampling/sampling_logp_difference/mean": 0.021462474018335342,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 2.0661535927501973e-05,
+      "clip_ratio/high_mean": 5.165383981875493e-06,
+      "clip_ratio/low_mean": 2.4304956298237812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.947033948430544e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14658.0,
+      "completions/max_terminated_length": 14658.0,
+      "completions/mean_length": 4886.875,
+      "completions/mean_terminated_length": 4886.875,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 1.0108910650014877,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002063734456896782,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 9928446.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2409384697675705,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000026226043701,
+      "sampling/importance_sampling_ratio/min": 0.0003672837920021266,
+      "sampling/sampling_logp_difference/max": 7.9093756675720215,
+      "sampling/sampling_logp_difference/mean": 0.01918785460293293,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.4761846993424115e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4761846993424115e-06,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12992.0,
+      "completions/max_terminated_length": 12992.0,
+      "completions/mean_length": 4824.0078125,
+      "completions/mean_terminated_length": 4824.0078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 1.1070282831788063,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002424790756776929,
+      "learning_rate": 1e-05,
+      "loss": 0.0485,
+      "num_tokens": 10566415.0,
+      "reward": 0.28125,
+      "reward_std": 0.23698672652244568,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0011708867968991399,
+      "sampling/sampling_logp_difference/max": 6.749993801116943,
+      "sampling/sampling_logp_difference/mean": 0.02069389820098877,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 3.5075904634140898e-06,
+      "clip_ratio/high_mean": 8.768976158535224e-07,
+      "clip_ratio/low_mean": 2.2676964135825983e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3553861751679506e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12685.0,
+      "completions/mean_length": 5449.4140625,
+      "completions/mean_terminated_length": 5363.31494140625,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "entropy": 0.9817888736724854,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021046048495918512,
+      "learning_rate": 1e-05,
+      "loss": 0.0252,
+      "num_tokens": 11281908.0,
+      "reward": 0.2265625,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805688858032,
+      "sampling/importance_sampling_ratio/min": 0.013273254036903381,
+      "sampling/sampling_logp_difference/max": 4.322004318237305,
+      "sampling/sampling_logp_difference/mean": 0.019556276500225067,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 1.624216065465589e-05,
+      "clip_ratio/high_mean": 4.060540163663973e-06,
+      "clip_ratio/low_mean": 5.4349347919924185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.840988796990132e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14133.0,
+      "completions/max_terminated_length": 14133.0,
+      "completions/mean_length": 5343.25,
+      "completions/mean_terminated_length": 5343.25,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 1.04741720110178,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035894038155674934,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 11987692.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998996257781982,
+      "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05,
+      "sampling/sampling_logp_difference/max": 10.749964714050293,
+      "sampling/sampling_logp_difference/mean": 0.020530637353658676,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.272115029380075e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.272115029380075e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15138.0,
+      "completions/mean_length": 6301.9375,
+      "completions/mean_terminated_length": 5806.09814453125,
+      "completions/min_length": 72.0,
+      "completions/min_terminated_length": 72.0,
+      "entropy": 0.8892941772937775,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032246762420982122,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 12814244.0,
+      "reward": 0.3125,
+      "reward_std": 0.3606000542640686,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999184608459473,
+      "sampling/importance_sampling_ratio/min": 0.021351110190153122,
+      "sampling/sampling_logp_difference/max": 3.846651554107666,
+      "sampling/sampling_logp_difference/mean": 0.017541853711009026,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 9.956602298188955e-06,
+      "clip_ratio/high_mean": 2.4891505745472386e-06,
+      "clip_ratio/low_mean": 2.772165316855535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0210803743102588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16213.0,
+      "completions/max_terminated_length": 16213.0,
+      "completions/mean_length": 5297.46875,
+      "completions/mean_terminated_length": 5297.46875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8097029253840446,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023969109170138836,
+      "learning_rate": 1e-05,
+      "loss": -0.0153,
+      "num_tokens": 13512520.0,
+      "reward": 0.359375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999222159385681,
+      "sampling/importance_sampling_ratio/min": 0.005766105372458696,
+      "sampling/sampling_logp_difference/max": 5.155758380889893,
+      "sampling/sampling_logp_difference/mean": 0.017464376986026764,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 1.0098337497765897e-05,
+      "clip_ratio/high_mean": 2.524584374441474e-06,
+      "clip_ratio/low_mean": 3.173396362399217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.425854845318099e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14655.0,
+      "completions/mean_length": 4890.34375,
+      "completions/mean_terminated_length": 4799.84228515625,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "entropy": 0.9267145916819572,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002759338356554508,
+      "learning_rate": 1e-05,
+      "loss": -0.0014,
+      "num_tokens": 14155556.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.008491010405123234,
+      "sampling/sampling_logp_difference/max": 4.768747329711914,
+      "sampling/sampling_logp_difference/mean": 0.018839433789253235,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 7.532389190600952e-06,
+      "clip_ratio/high_mean": 1.883097297650238e-06,
+      "clip_ratio/low_mean": 1.9051809317716106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0934906729053182e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16296.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 4609.40625,
+      "completions/mean_terminated_length": 4609.40625,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 1.171089917421341,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021055075339972973,
+      "learning_rate": 1e-05,
+      "loss": -0.0051,
+      "num_tokens": 14765328.0,
+      "reward": 0.2421875,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741911888123,
+      "sampling/importance_sampling_ratio/min": 5.368983693188056e-07,
+      "sampling/sampling_logp_difference/max": 14.437457084655762,
+      "sampling/sampling_logp_difference/mean": 0.020226795226335526,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.7169573766295798e-05,
+      "clip_ratio/high_mean": 4.2923934415739495e-06,
+      "clip_ratio/low_mean": 5.869748633813288e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.0162142189074075e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14299.0,
+      "completions/mean_length": 5099.0390625,
+      "completions/mean_terminated_length": 5010.18115234375,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.005959376692772,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027595218271017075,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 15438549.0,
+      "reward": 0.296875,
+      "reward_std": 0.20069602131843567,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887347221375,
+      "sampling/importance_sampling_ratio/min": 0.00013984869292471558,
+      "sampling/sampling_logp_difference/max": 8.87494945526123,
+      "sampling/sampling_logp_difference/mean": 0.01902824640274048,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 5.162942670722259e-06,
+      "clip_ratio/high_mean": 1.2907356676805648e-06,
+      "clip_ratio/low_mean": 3.6872071063953626e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.816280593582633e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7138.0390625,
+      "completions/mean_terminated_length": 6839.7822265625,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.0403362140059471,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002748022088780999,
+      "learning_rate": 1e-05,
+      "loss": 0.0647,
+      "num_tokens": 16373898.0,
+      "reward": 0.296875,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999048709869385,
+      "sampling/importance_sampling_ratio/min": 0.0003802926803473383,
+      "sampling/sampling_logp_difference/max": 7.874569416046143,
+      "sampling/sampling_logp_difference/mean": 0.020853528752923012,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.6506045439164154e-05,
+      "clip_ratio/low_min": 5.709326615033206e-06,
+      "clip_ratio/region_mean": 5.6506045439164154e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14543.0,
+      "completions/mean_length": 5420.515625,
+      "completions/mean_terminated_length": 5334.18896484375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 1.1339883506298065,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029502976685762405,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 17088156.0,
+      "reward": 0.1953125,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 9.70982582657598e-05,
+      "sampling/sampling_logp_difference/max": 9.239787101745605,
+      "sampling/sampling_logp_difference/mean": 0.0199423898011446,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 5.619998319161823e-06,
+      "clip_ratio/high_mean": 1.4049995797904558e-06,
+      "clip_ratio/low_mean": 6.439320418394345e-05,
+      "clip_ratio/low_min": 4.70632539872895e-06,
+      "clip_ratio/region_mean": 6.57982034226734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14636.0,
+      "completions/mean_length": 5116.3046875,
+      "completions/mean_terminated_length": 4845.88037109375,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.9503882825374603,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004891107324510813,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 17766619.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0010618992382660508,
+      "sampling/sampling_logp_difference/max": 6.847696304321289,
+      "sampling/sampling_logp_difference/mean": 0.01914183795452118,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.839018643247982e-05,
+      "clip_ratio/low_min": 4.115091087442124e-06,
+      "clip_ratio/region_mean": 3.839018643247982e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14634.0,
+      "completions/mean_length": 5061.8671875,
+      "completions/mean_terminated_length": 4972.71630859375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 1.0540335327386856,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030373274348676205,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 18432938.0,
+      "reward": 0.34375,
+      "reward_std": 0.28118088841438293,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999624490737915,
+      "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06,
+      "sampling/sampling_logp_difference/max": 13.272432327270508,
+      "sampling/sampling_logp_difference/mean": 0.019548218697309494,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.4656657867817557e-05,
+      "clip_ratio/high_mean": 4.665093399580655e-06,
+      "clip_ratio/low_mean": 3.751162262233265e-05,
+      "clip_ratio/low_min": 4.413062470121076e-06,
+      "clip_ratio/region_mean": 4.2176716192443564e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15782.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6349.9765625,
+      "completions/mean_terminated_length": 6349.9765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0268081277608871,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0017623496241867542,
+      "learning_rate": 1e-05,
+      "loss": 0.0011,
+      "num_tokens": 19264743.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000061988830566,
+      "sampling/importance_sampling_ratio/min": 6.870362267363816e-05,
+      "sampling/sampling_logp_difference/max": 9.585708618164062,
+      "sampling/sampling_logp_difference/mean": 0.019106190651655197,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 9.221375876222737e-06,
+      "clip_ratio/high_mean": 2.3053439690556843e-06,
+      "clip_ratio/low_mean": 3.09787185415189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.328406273794826e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 5815.484375,
+      "completions/mean_terminated_length": 5561.84033203125,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 1.0389493256807327,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003111837198957801,
+      "learning_rate": 1e-05,
+      "loss": -0.0162,
+      "num_tokens": 20030109.0,
+      "reward": 0.34375,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000298023223877,
+      "sampling/importance_sampling_ratio/min": 0.02987043187022209,
+      "sampling/sampling_logp_difference/max": 3.5108861923217773,
+      "sampling/sampling_logp_difference/mean": 0.020060991868376732,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 6.7810142354574054e-06,
+      "clip_ratio/high_mean": 1.6952535588643514e-06,
+      "clip_ratio/low_mean": 4.474762545214617e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644287901101052e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 5157.1484375,
+      "completions/mean_terminated_length": 5068.748046875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.0510126948356628,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003041633637621999,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 20710904.0,
+      "reward": 0.3125,
+      "reward_std": 0.35612428188323975,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 0.04357198625802994,
+      "sampling/sampling_logp_difference/max": 3.133340835571289,
+      "sampling/sampling_logp_difference/mean": 0.019007597118616104,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0962848566341563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0962848566341563e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15333.0,
+      "completions/max_terminated_length": 15333.0,
+      "completions/mean_length": 4446.3828125,
+      "completions/mean_terminated_length": 4446.3828125,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.053279548883438,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022369560319930315,
+      "learning_rate": 1e-05,
+      "loss": -0.001,
+      "num_tokens": 21298497.0,
+      "reward": 0.390625,
+      "reward_std": 0.24169495701789856,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998750686645508,
+      "sampling/importance_sampling_ratio/min": 0.006704842206090689,
+      "sampling/sampling_logp_difference/max": 5.00492525100708,
+      "sampling/sampling_logp_difference/mean": 0.01947362720966339,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8460265411922592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8460265411922592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15386.0,
+      "completions/mean_length": 6294.1484375,
+      "completions/mean_terminated_length": 6133.9921875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 1.2036212533712387,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021383841522037983,
+      "learning_rate": 1e-05,
+      "loss": 0.033,
+      "num_tokens": 22124812.0,
+      "reward": 0.171875,
+      "reward_std": 0.20752590894699097,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858736991882,
+      "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07,
+      "sampling/sampling_logp_difference/max": 14.742476463317871,
+      "sampling/sampling_logp_difference/mean": 0.022367021068930626,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 1.73864664247958e-05,
+      "clip_ratio/high_mean": 4.34661660619895e-06,
+      "clip_ratio/low_mean": 3.19569651310303e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.630358173722925e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14893.0,
+      "completions/mean_length": 6011.4921875,
+      "completions/mean_terminated_length": 5929.81884765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.123318687081337,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00126531848218292,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 22915091.0,
+      "reward": 0.171875,
+      "reward_std": 0.2330477386713028,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999861121177673,
+      "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05,
+      "sampling/sampling_logp_difference/max": 11.02016544342041,
+      "sampling/sampling_logp_difference/mean": 0.019905246794223785,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 2.8753217975463485e-05,
+      "clip_ratio/high_mean": 7.188304493865871e-06,
+      "clip_ratio/low_mean": 3.818478444372886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.537308905128157e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5152.46875,
+      "completions/mean_terminated_length": 5064.03125,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "entropy": 1.0477670058608055,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030069497879594564,
+      "learning_rate": 1e-05,
+      "loss": 0.1026,
+      "num_tokens": 23596487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29142576456069946,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999433755874634,
+      "sampling/importance_sampling_ratio/min": 9.009604013954231e-07,
+      "sampling/sampling_logp_difference/max": 13.919804573059082,
+      "sampling/sampling_logp_difference/mean": 0.019003981724381447,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 3.069575450354023e-05,
+      "clip_ratio/high_mean": 7.673938625885057e-06,
+      "clip_ratio/low_mean": 3.4847614415411954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.252155258654966e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12792.0,
+      "completions/max_terminated_length": 12792.0,
+      "completions/mean_length": 4672.5703125,
+      "completions/mean_terminated_length": 4672.5703125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9471446052193642,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002676331205293536,
+      "learning_rate": 1e-05,
+      "loss": 0.0724,
+      "num_tokens": 24213408.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2988021969795227,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000251531600952,
+      "sampling/importance_sampling_ratio/min": 0.0013351094676181674,
+      "sampling/sampling_logp_difference/max": 6.618741989135742,
+      "sampling/sampling_logp_difference/mean": 0.0179576613008976,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.6127243245355203e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6127243245355203e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16108.0,
+      "completions/mean_length": 7013.734375,
+      "completions/mean_terminated_length": 6711.4677734375,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "entropy": 1.1254516392946243,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023615453392267227,
+      "learning_rate": 1e-05,
+      "loss": 0.0384,
+      "num_tokens": 25130262.0,
+      "reward": 0.1953125,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06,
+      "sampling/sampling_logp_difference/max": 11.925450325012207,
+      "sampling/sampling_logp_difference/mean": 0.0215257927775383,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 4.06954040954588e-06,
+      "clip_ratio/high_mean": 1.01738510238647e-06,
+      "clip_ratio/low_mean": 4.180071573500754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.281810015527299e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5858.59375,
+      "completions/mean_terminated_length": 5605.984375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "entropy": 1.0713739022612572,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029018481727689505,
+      "learning_rate": 1e-05,
+      "loss": 0.1041,
+      "num_tokens": 25898194.0,
+      "reward": 0.3671875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999915957450867,
+      "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05,
+      "sampling/sampling_logp_difference/max": 10.992064476013184,
+      "sampling/sampling_logp_difference/mean": 0.019959844648838043,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 1.2810827229259303e-05,
+      "clip_ratio/high_mean": 3.2027068073148257e-06,
+      "clip_ratio/low_mean": 3.29701083501277e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.617281504375569e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14004.0,
+      "completions/mean_length": 6952.6015625,
+      "completions/mean_terminated_length": 6726.24853515625,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "entropy": 1.028619796037674,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022342968732118607,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 26812791.0,
+      "reward": 0.234375,
+      "reward_std": 0.26827272772789,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 4.540153167909011e-05,
+      "sampling/sampling_logp_difference/max": 9.999964714050293,
+      "sampling/sampling_logp_difference/mean": 0.02002539485692978,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 1.5225089100567857e-05,
+      "clip_ratio/high_mean": 6.960676159906143e-06,
+      "clip_ratio/low_mean": 4.09088329433871e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7869508762232726e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16361.0,
+      "completions/mean_length": 6413.421875,
+      "completions/mean_terminated_length": 6174.12841796875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9452399462461472,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021800603717565536,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 27652757.0,
+      "reward": 0.296875,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439120292664,
+      "sampling/importance_sampling_ratio/min": 3.895394547726028e-05,
+      "sampling/sampling_logp_difference/max": 10.153130531311035,
+      "sampling/sampling_logp_difference/mean": 0.019722118973731995,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.9564903318023426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9564903318023426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15754.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 5176.3515625,
+      "completions/mean_terminated_length": 5176.3515625,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 1.0444758981466293,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004153470974415541,
+      "learning_rate": 1e-05,
+      "loss": 0.0798,
+      "num_tokens": 28334386.0,
+      "reward": 0.2734375,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.007421077694743872,
+      "sampling/sampling_logp_difference/max": 4.903430938720703,
+      "sampling/sampling_logp_difference/mean": 0.020159056410193443,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 1.725743459246587e-05,
+      "clip_ratio/high_mean": 4.3143586481164675e-06,
+      "clip_ratio/low_mean": 2.0204584302518924e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.451894306432223e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 5178.9921875,
+      "completions/mean_terminated_length": 5001.13525390625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0803537145256996,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002477057045325637,
+      "learning_rate": 1e-05,
+      "loss": 0.0067,
+      "num_tokens": 29017145.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000497102737427,
+      "sampling/importance_sampling_ratio/min": 0.004630985204130411,
+      "sampling/sampling_logp_difference/max": 5.374985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019826076924800873,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 1.6637992303003557e-05,
+      "clip_ratio/high_mean": 4.159498075750889e-06,
+      "clip_ratio/low_mean": 2.1970684144889674e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6130182106953725e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14131.0,
+      "completions/max_terminated_length": 14131.0,
+      "completions/mean_length": 4980.359375,
+      "completions/mean_terminated_length": 4980.359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.9510642662644386,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016275218222290277,
+      "learning_rate": 1e-05,
+      "loss": -0.0097,
+      "num_tokens": 29673535.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999750852584839,
+      "sampling/importance_sampling_ratio/min": 0.000599516904912889,
+      "sampling/sampling_logp_difference/max": 7.419386386871338,
+      "sampling/sampling_logp_difference/mean": 0.01844976656138897,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 2.8087193186365766e-05,
+      "clip_ratio/high_mean": 7.021798296591442e-06,
+      "clip_ratio/low_mean": 3.9683913541921356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.670571286169434e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 5778.6953125,
+      "completions/mean_terminated_length": 5695.18896484375,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 1.0413239300251007,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001847646082751453,
+      "learning_rate": 1e-05,
+      "loss": -0.0045,
+      "num_tokens": 30436416.0,
+      "reward": 0.2578125,
+      "reward_std": 0.33903977274894714,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998501539230347,
+      "sampling/importance_sampling_ratio/min": 0.00020348970429040492,
+      "sampling/sampling_logp_difference/max": 8.499895095825195,
+      "sampling/sampling_logp_difference/mean": 0.021502099931240082,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 2.68402091023745e-05,
+      "clip_ratio/high_mean": 8.575278570788214e-06,
+      "clip_ratio/low_mean": 4.547183698377921e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.404711600931478e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14182.0,
+      "completions/max_terminated_length": 14182.0,
+      "completions/mean_length": 4875.125,
+      "completions/mean_terminated_length": 4875.125,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "entropy": 1.0464690178632736,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021134833805263042,
+      "learning_rate": 1e-05,
+      "loss": 0.0727,
+      "num_tokens": 31083672.0,
+      "reward": 0.40625,
+      "reward_std": 0.3584783971309662,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340176582336,
+      "sampling/importance_sampling_ratio/min": 0.012113225646317005,
+      "sampling/sampling_logp_difference/max": 4.41345739364624,
+      "sampling/sampling_logp_difference/mean": 0.019140049815177917,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 3.9877967992651975e-05,
+      "clip_ratio/high_mean": 9.969491998162994e-06,
+      "clip_ratio/low_mean": 3.981287841270387e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9782369273998484e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 4691.421875,
+      "completions/mean_terminated_length": 4505.82568359375,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 1.0229775309562683,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037735572550445795,
+      "learning_rate": 1e-05,
+      "loss": 0.0603,
+      "num_tokens": 31703654.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2993389964103699,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492168426514,
+      "sampling/importance_sampling_ratio/min": 0.03150063753128052,
+      "sampling/sampling_logp_difference/max": 3.457747459411621,
+      "sampling/sampling_logp_difference/mean": 0.01912039890885353,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 3.5441889849607833e-06,
+      "clip_ratio/high_mean": 8.860472462401958e-07,
+      "clip_ratio/low_mean": 1.5137359810069029e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6023407056309225e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 6821.96875,
+      "completions/mean_terminated_length": 6592.48046875,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 1.1132484003901482,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0010448681423440576,
+      "learning_rate": 1e-05,
+      "loss": 0.022,
+      "num_tokens": 32599778.0,
+      "reward": 0.2265625,
+      "reward_std": 0.1814819872379303,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999915361404419,
+      "sampling/importance_sampling_ratio/min": 0.006500681862235069,
+      "sampling/sampling_logp_difference/max": 5.035848140716553,
+      "sampling/sampling_logp_difference/mean": 0.02125459350645542,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 4.652893949241843e-06,
+      "clip_ratio/high_mean": 1.1632234873104608e-06,
+      "clip_ratio/low_mean": 5.731516603191267e-05,
+      "clip_ratio/low_min": 9.891066838463303e-06,
+      "clip_ratio/region_mean": 5.8478389746596804e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 6834.3671875,
+      "completions/mean_terminated_length": 6605.17626953125,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9827468693256378,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0017670176457613707,
+      "learning_rate": 1e-05,
+      "loss": 0.1105,
+      "num_tokens": 33492737.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3440523147583008,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.0021202093921601772,
+      "sampling/sampling_logp_difference/max": 6.156240463256836,
+      "sampling/sampling_logp_difference/mean": 0.019490526989102364,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 6.717360520269722e-06,
+      "clip_ratio/high_mean": 2.503530367903295e-06,
+      "clip_ratio/low_mean": 2.5672919832686603e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8176450200589898e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14098.0,
+      "completions/mean_length": 6175.296875,
+      "completions/mean_terminated_length": 5845.98388671875,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 1.1584237962961197,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0016891945851966739,
+      "learning_rate": 1e-05,
+      "loss": -0.0008,
+      "num_tokens": 34312455.0,
+      "reward": 0.1875,
+      "reward_std": 0.19673937559127808,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 8.086384332273155e-05,
+      "sampling/sampling_logp_difference/max": 9.422743797302246,
+      "sampling/sampling_logp_difference/mean": 0.021749887615442276,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 2.2362002255249536e-05,
+      "clip_ratio/high_mean": 8.189798336388776e-06,
+      "clip_ratio/low_mean": 2.1058204993096297e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9248002192616696e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16054.0,
+      "completions/mean_length": 6036.8359375,
+      "completions/mean_terminated_length": 5955.3623046875,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.9301538467407227,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003834392176941037,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 35102738.0,
+      "reward": 0.4375,
+      "reward_std": 0.36614155769348145,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998494386672974,
+      "sampling/importance_sampling_ratio/min": 0.00013992394087836146,
+      "sampling/sampling_logp_difference/max": 8.874411582946777,
+      "sampling/sampling_logp_difference/mean": 0.019147861748933792,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1501961580506759e-05,
+      "clip_ratio/high_mean": 2.8754903951266897e-06,
+      "clip_ratio/low_mean": 4.08189714562468e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.369446196506033e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6262.46875,
+      "completions/mean_terminated_length": 5764.68798828125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "entropy": 0.8599015846848488,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0029804729856550694,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 35924886.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3911295533180237,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999922513961792,
+      "sampling/importance_sampling_ratio/min": 0.00021375219512265176,
+      "sampling/sampling_logp_difference/max": 9.904524803161621,
+      "sampling/sampling_logp_difference/mean": 0.01815103553235531,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 2.4107544049911667e-05,
+      "clip_ratio/high_mean": 6.026886012477917e-06,
+      "clip_ratio/low_mean": 3.6588148361715866e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.261503391944643e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14556.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 5926.8984375,
+      "completions/mean_terminated_length": 5926.8984375,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "entropy": 1.0042993426322937,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022071697749197483,
+      "learning_rate": 1e-05,
+      "loss": 0.0059,
+      "num_tokens": 36700913.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3306073546409607,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000010371208191,
+      "sampling/importance_sampling_ratio/min": 0.0005220364546403289,
+      "sampling/sampling_logp_difference/max": 7.557773113250732,
+      "sampling/sampling_logp_difference/mean": 0.01954064890742302,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 4.9106265578302555e-06,
+      "clip_ratio/high_mean": 1.2276566394575639e-06,
+      "clip_ratio/low_mean": 2.634599570683349e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7573652346291055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15217.0,
+      "completions/mean_length": 6873.6875,
+      "completions/mean_terminated_length": 6645.4404296875,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 1.0255412608385086,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002320924773812294,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 37604865.0,
+      "reward": 0.234375,
+      "reward_std": 0.3135228157043457,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999098777770996,
+      "sampling/importance_sampling_ratio/min": 0.026153141632676125,
+      "sampling/sampling_logp_difference/max": 3.6437859535217285,
+      "sampling/sampling_logp_difference/mean": 0.019532475620508194,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.6350510122720152e-05,
+      "clip_ratio/high_mean": 4.087627530680038e-06,
+      "clip_ratio/low_mean": 2.351988746340794e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7607515221461654e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15668.0,
+      "completions/mean_length": 6073.8984375,
+      "completions/mean_terminated_length": 5992.71630859375,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 1.0713753998279572,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002212709980085492,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 38405196.0,
+      "reward": 0.359375,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998978972434998,
+      "sampling/importance_sampling_ratio/min": 8.706459084351081e-06,
+      "sampling/sampling_logp_difference/max": 11.651445388793945,
+      "sampling/sampling_logp_difference/mean": 0.021252838894724846,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.729486718384578e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.729486718384578e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15299.0,
+      "completions/mean_length": 5838.71875,
+      "completions/mean_terminated_length": 5671.33349609375,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "entropy": 1.021155133843422,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001135052996687591,
+      "learning_rate": 1e-05,
+      "loss": 0.0178,
+      "num_tokens": 39171704.0,
+      "reward": 0.28125,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.003084881929680705,
+      "sampling/sampling_logp_difference/max": 5.7812418937683105,
+      "sampling/sampling_logp_difference/mean": 0.020781882107257843,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 1.7124169744420215e-05,
+      "clip_ratio/high_mean": 4.281042436105054e-06,
+      "clip_ratio/low_mean": 3.706903294187214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.135007543482061e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14617.0,
+      "completions/max_terminated_length": 14617.0,
+      "completions/mean_length": 6358.5859375,
+      "completions/mean_terminated_length": 6358.5859375,
+      "completions/min_length": 940.0,
+      "completions/min_terminated_length": 940.0,
+      "entropy": 0.9720487147569656,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002638082252815366,
+      "learning_rate": 1e-05,
+      "loss": 0.0145,
+      "num_tokens": 40003859.0,
+      "reward": 0.40625,
+      "reward_std": 0.3174618184566498,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000380277633667,
+      "sampling/importance_sampling_ratio/min": 0.01960253342986107,
+      "sampling/sampling_logp_difference/max": 3.932096481323242,
+      "sampling/sampling_logp_difference/mean": 0.01991666667163372,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 6.55582925901399e-06,
+      "clip_ratio/high_mean": 2.994117721755174e-06,
+      "clip_ratio/low_mean": 2.222621503733535e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5220332759090525e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14753.0,
+      "completions/max_terminated_length": 14753.0,
+      "completions/mean_length": 4634.1875,
+      "completions/mean_terminated_length": 4634.1875,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9715309366583824,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001994960242882371,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 40616483.0,
+      "reward": 0.4375,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000698566436768,
+      "sampling/importance_sampling_ratio/min": 1.0510009815334342e-05,
+      "sampling/sampling_logp_difference/max": 11.46318244934082,
+      "sampling/sampling_logp_difference/mean": 0.01902047172188759,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 2.2474248908110894e-05,
+      "clip_ratio/high_mean": 7.571314540655294e-06,
+      "clip_ratio/low_mean": 4.3583780325207044e-05,
+      "clip_ratio/low_min": 4.6013396968191955e-06,
+      "clip_ratio/region_mean": 5.1155094070054474e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15953.0,
+      "completions/mean_length": 6596.25,
+      "completions/mean_terminated_length": 6361.34423828125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 0.8207943215966225,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019902780186384916,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 41484443.0,
+      "reward": 0.4453125,
+      "reward_std": 0.326668381690979,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000016689300537,
+      "sampling/importance_sampling_ratio/min": 7.485233072657138e-05,
+      "sampling/sampling_logp_difference/max": 9.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.018301833420991898,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 3.0019932637515012e-06,
+      "clip_ratio/high_mean": 7.504983159378753e-07,
+      "clip_ratio/low_mean": 4.332785601945943e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.407835376696312e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6785.75,
+      "completions/mean_terminated_length": 6313.70458984375,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "entropy": 0.9876058474183083,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015235114842653275,
+      "learning_rate": 1e-05,
+      "loss": 0.0128,
+      "num_tokens": 42372235.0,
+      "reward": 0.2421875,
+      "reward_std": 0.325075626373291,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999551773071289,
+      "sampling/importance_sampling_ratio/min": 0.026679370552301407,
+      "sampling/sampling_logp_difference/max": 3.6238646507263184,
+      "sampling/sampling_logp_difference/mean": 0.019945615902543068,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1349006601667497e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1349006601667497e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 4881.2109375,
+      "completions/mean_terminated_length": 4510.1533203125,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.989942155778408,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002033712575212121,
+      "learning_rate": 1e-05,
+      "loss": 0.1088,
+      "num_tokens": 43015238.0,
+      "reward": 0.4375,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000300407409668,
+      "sampling/importance_sampling_ratio/min": 0.0001238943514181301,
+      "sampling/sampling_logp_difference/max": 8.996081352233887,
+      "sampling/sampling_logp_difference/mean": 0.01887543685734272,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 2.584004687378183e-05,
+      "clip_ratio/high_mean": 6.4600117184454575e-06,
+      "clip_ratio/low_mean": 2.1371045761497953e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7831058105221018e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15001.0,
+      "completions/max_terminated_length": 15001.0,
+      "completions/mean_length": 4725.3984375,
+      "completions/mean_terminated_length": 4725.3984375,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "entropy": 1.0350637435913086,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030296226032078266,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 43637737.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999939203262329,
+      "sampling/importance_sampling_ratio/min": 0.00022932067804504186,
+      "sampling/sampling_logp_difference/max": 8.380389213562012,
+      "sampling/sampling_logp_difference/mean": 0.01995944231748581,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 1.994733975152485e-05,
+      "clip_ratio/high_mean": 4.986834937881213e-06,
+      "clip_ratio/low_mean": 3.5168303838872816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.015513832200668e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16240.0,
+      "completions/mean_length": 4918.171875,
+      "completions/mean_terminated_length": 4736.1748046875,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.965274304151535,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002758471528068185,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 44285327.0,
+      "reward": 0.328125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999663233757019,
+      "sampling/importance_sampling_ratio/min": 0.010958661325275898,
+      "sampling/sampling_logp_difference/max": 4.513625144958496,
+      "sampling/sampling_logp_difference/mean": 0.019083233550190926,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 1.0621563887980301e-05,
+      "clip_ratio/high_mean": 2.6553909719950752e-06,
+      "clip_ratio/low_mean": 3.838553107016196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1040922042157035e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15031.0,
+      "completions/mean_length": 4998.2890625,
+      "completions/mean_terminated_length": 4908.6376953125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.9200445115566254,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027611786499619484,
+      "learning_rate": 1e-05,
+      "loss": 0.0575,
+      "num_tokens": 44944356.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3895368278026581,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999884366989136,
+      "sampling/importance_sampling_ratio/min": 0.0018651526188477874,
+      "sampling/sampling_logp_difference/max": 6.284412384033203,
+      "sampling/sampling_logp_difference/mean": 0.017853498458862305,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.0136624496226432e-05,
+      "clip_ratio/high_mean": 2.534156124056608e-06,
+      "clip_ratio/low_mean": 2.0260404085092887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2794560095462657e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6290.1796875,
+      "completions/mean_terminated_length": 6129.96044921875,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.9360214695334435,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015557854203507304,
+      "learning_rate": 1e-05,
+      "loss": 0.0111,
+      "num_tokens": 45767867.0,
+      "reward": 0.34375,
+      "reward_std": 0.30168038606643677,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999427795410156,
+      "sampling/importance_sampling_ratio/min": 0.0011004531988874078,
+      "sampling/sampling_logp_difference/max": 6.812033176422119,
+      "sampling/sampling_logp_difference/mean": 0.0200855303555727,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 2.2559511307918e-06,
+      "clip_ratio/high_mean": 5.6398778269795e-07,
+      "clip_ratio/low_mean": 4.51761221711422e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.574010984015331e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16366.0,
+      "completions/mean_length": 6486.15625,
+      "completions/mean_terminated_length": 6248.6083984375,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "entropy": 0.863138921558857,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026953541673719883,
+      "learning_rate": 1e-05,
+      "loss": -0.0194,
+      "num_tokens": 46618575.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999406337738037,
+      "sampling/importance_sampling_ratio/min": 0.0011708897072821856,
+      "sampling/sampling_logp_difference/max": 6.749991416931152,
+      "sampling/sampling_logp_difference/mean": 0.01863238587975502,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.0073357771034352e-05,
+      "clip_ratio/high_mean": 2.518339442758588e-06,
+      "clip_ratio/low_mean": 2.787370635815023e-05,
+      "clip_ratio/low_min": 3.837534222839167e-06,
+      "clip_ratio/region_mean": 3.0392045573535142e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16010.0,
+      "completions/mean_length": 6442.7734375,
+      "completions/mean_terminated_length": 6284.9765625,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0242054909467697,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024442619178444147,
+      "learning_rate": 1e-05,
+      "loss": 0.0569,
+      "num_tokens": 47462274.0,
+      "reward": 0.328125,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998892545700073,
+      "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09,
+      "sampling/sampling_logp_difference/max": 19.124980926513672,
+      "sampling/sampling_logp_difference/mean": 0.019810764119029045,
+      "step": 64
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 47462274,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_lora_plus_20251202_001141/output.log b/dapo_lora_plus_20251202_001141/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..a5a9eaf66ed4b3f66a8b5d1772df9fd85210dee8
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/output.log
@@ -0,0 +1,6407 @@
+W1202 00:12:01.875000 96731 torch/distributed/run.py:774] 
+W1202 00:12:01.875000 96731 torch/distributed/run.py:774] *****************************************
+W1202 00:12:01.875000 96731 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W1202 00:12:01.875000 96731 torch/distributed/run.py:774] *****************************************
+INFO 12-02 00:12:24 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-02 00:12:24 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-02 00:12:24 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-02 00:12:24 [__init__.py:216] Automatically detected platform cuda.
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora_plus', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lora_plus_20251202_001141', run_name='outputs/dapo_lora_plus_20251202_001141', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora_plus', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lora_plus_20251202_001141', run_name='outputs/dapo_lora_plus_20251202_001141', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, 
logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora_plus', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lora_plus_20251202_001141', run_name='outputs/dapo_lora_plus_20251202_001141', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora_plus', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lora_plus_20251202_001141', run_name='outputs/dapo_lora_plus_20251202_001141', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+[OpenTinker] 2025-12-02 00:12:29,815 - root - INFO - Output directory outputs/dapo_lora_plus_20251202_001141 already exists, using it
+[OpenTinker] 2025-12-02 00:12:29,815 - root - INFO - Output directory outputs/dapo_lora_plus_20251202_001141 already exists, using it
+[OpenTinker] 2025-12-02 00:12:29,815 - root - INFO - Output directory outputs/dapo_lora_plus_20251202_001141 already exists, using it
+[OpenTinker] 2025-12-02 00:12:29,816 - root - INFO - Output directory outputs/dapo_lora_plus_20251202_001141 already exists, using it
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: setting up run v6m8ctck
+wandb: setting up run 3kl3jf3g
+wandb: setting up run l45ioj2b
+wandb: setting up run naqjbub0
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_001233-naqjbub0
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_lora_plus_20251202_001141
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/naqjbub0
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_001233-l45ioj2b
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_lora_plus_20251202_001141
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/l45ioj2b
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+[OpenTinker] 2025-12-02 00:12:35,516 - root - INFO - Wandb initialized successfully
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 00:12:35,517 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-02 00:12:35,517 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-02 00:12:35,517 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_001233-3kl3jf3g
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_lora_plus_20251202_001141
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/3kl3jf3g
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_001233-v6m8ctck
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_lora_plus_20251202_001141
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/v6m8ctck
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 00:12:35,809 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-02 00:12:35,809 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 00:12:35,981 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-02 00:12:35,981 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-02 00:12:36,844 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 00:12:36,872 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 00:12:37,011 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 00:12:37,195 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 00:12:40,047 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-02 00:12:40,076 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-02 00:12:40,102 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-02 00:12:40,222 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+[OpenTinker] 2025-12-02 00:12:41,300 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 00:12:41,300 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 00:12:41,324 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 00:12:41,324 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 00:12:41,467 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 00:12:41,467 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 00:12:41,475 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 00:12:41,475 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 00:12:41,729 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 00:12:41,767 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 00:12:41,831 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 00:12:41,832 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 00:12:42,231 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpye19kupf/test.c -o /tmp/tmpye19kupf/test.o
+[OpenTinker] 2025-12-02 00:12:42,231 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpkzj37tn0/test.c -o /tmp/tmpkzj37tn0/test.o
+[OpenTinker] 2025-12-02 00:12:42,246 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmppq3_j2pr/test.c -o /tmp/tmppq3_j2pr/test.o
+[OpenTinker] 2025-12-02 00:12:42,246 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp1tdyxj41/test.c -o /tmp/tmp1tdyxj41/test.o
+[OpenTinker] 2025-12-02 00:12:42,261 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpye19kupf/test.o -laio -o /tmp/tmpye19kupf/a.out
+[OpenTinker] 2025-12-02 00:12:42,275 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpkzj37tn0/test.o -laio -o /tmp/tmpkzj37tn0/a.out
+[OpenTinker] 2025-12-02 00:12:42,290 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmppq3_j2pr/test.o -laio -o /tmp/tmppq3_j2pr/a.out
+[OpenTinker] 2025-12-02 00:12:42,305 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp1tdyxj41/test.o -laio -o /tmp/tmp1tdyxj41/a.out
+[OpenTinker] 2025-12-02 00:12:42,789 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp4vltduv2/test.c -o /tmp/tmp4vltduv2/test.o
+[OpenTinker] 2025-12-02 00:12:42,802 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpgumyw8zr/test.c -o /tmp/tmpgumyw8zr/test.o
+[OpenTinker] 2025-12-02 00:12:42,803 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpeqggqm0g/test.c -o /tmp/tmpeqggqm0g/test.o
+[OpenTinker] 2025-12-02 00:12:42,830 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpz4mi1igy/test.c -o /tmp/tmpz4mi1igy/test.o
+[OpenTinker] 2025-12-02 00:12:42,846 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp4vltduv2/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp4vltduv2/a.out
+[OpenTinker] 2025-12-02 00:12:42,862 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpeqggqm0g/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpeqggqm0g/a.out
+[OpenTinker] 2025-12-02 00:12:42,875 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpgumyw8zr/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpgumyw8zr/a.out
+[OpenTinker] 2025-12-02 00:12:42,886 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpz4mi1igy/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpz4mi1igy/a.out
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO cudaDriverVersion 12090
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO cudaDriverVersion 12090
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO cudaDriverVersion 12090
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO cudaDriverVersion 12090
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Initialized NET plugin Socket
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Initialized NET plugin Socket
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Initialized NET plugin Socket
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Initialized NET plugin Socket
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO ncclCommInitRankConfig comm 0x1dbf3b00 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0x67adbc2d692fd1c8 - Init START
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO ncclCommInitRankConfig comm 0x1df75b30 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0x67adbc2d692fd1c8 - Init START
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO ncclCommInitRankConfig comm 0x1e24ba00 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0x67adbc2d692fd1c8 - Init START
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO ncclCommInitRankConfig comm 0x1f1d4810 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0x67adbc2d692fd1c8 - Init START
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Bootstrap timings total 0.002802 (create 0.000019, send 0.000086, recv 0.002229, ring 0.000169, delay 0.000000)
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Bootstrap timings total 0.002651 (create 0.000020, send 0.000086, recv 0.000098, ring 0.001236, delay 0.000001)
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Bootstrap timings total 0.001716 (create 0.000020, send 0.000107, recv 0.000044, ring 0.000116, delay 0.000000)
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Bootstrap timings total 0.000717 (create 0.000021, send 0.000095, recv 0.000166, ring 0.000061, delay 0.000001)
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO comm 0x1e24ba00 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO comm 0x1df75b30 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO comm 0x1f1d4810 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO comm 0x1dbf3b00 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-g2ri-2:96975:98060 [0] NCCL INFO [Proxy Service] Device 0 CPU core 21
+lshn-qs-g2ri-2:96975:98061 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 24
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-g2ri-2:96978:98062 [3] NCCL INFO [Proxy Service] Device 3 CPU core 6
+lshn-qs-g2ri-2:96978:98064 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 10
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-g2ri-2:96977:98063 [2] NCCL INFO [Proxy Service] Device 2 CPU core 28
+lshn-qs-g2ri-2:96977:98065 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 128
+lshn-qs-g2ri-2:96976:98066 [1] NCCL INFO [Proxy Service] Device 1 CPU core 11
+lshn-qs-g2ri-2:96976:98067 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 112
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO ncclCommInitRankConfig comm 0x1e24ba00 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0x67adbc2d692fd1c8 - Init COMPLETE
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 1.02 (kernels 0.15, alloc 0.69, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.05, rest 0.09)
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO ncclCommInitRankConfig comm 0x1f1d4810 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0x67adbc2d692fd1c8 - Init COMPLETE
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 1.03 (kernels 0.16, alloc 0.69, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.05, rest 0.09)
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO ncclCommInitRankConfig comm 0x1df75b30 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0x67adbc2d692fd1c8 - Init COMPLETE
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 1.03 (kernels 0.16, alloc 0.69, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.05, rest 0.08)
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO ncclCommInitRankConfig comm 0x1dbf3b00 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0x67adbc2d692fd1c8 - Init COMPLETE
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 1.03 (kernels 0.16, alloc 0.69, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.05, rest 0.09)
+[OpenTinker] 2025-12-02 00:12:47,140 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-02 00:12:47,141 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-02 00:12:47,229 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-02 00:12:47,229 - root - INFO - Training model with GRPO
+INFO 12-02 00:12:47 [utils.py:328] non-default args: {'seed': 1, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-02 00:12:47 [utils.py:328] non-default args: {'seed': 3, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-02 00:12:47 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-02 00:12:47 [utils.py:328] non-default args: {'seed': 2, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-02 00:13:05 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 00:13:05 [__init__.py:1815] Using max model len 16896
+INFO 12-02 00:13:05 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 00:13:05 [__init__.py:1815] Using max model len 16896
+INFO 12-02 00:13:05 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 00:13:05 [__init__.py:1815] Using max model len 16896
+INFO 12-02 00:13:05 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 00:13:05 [__init__.py:1815] Using max model len 16896
+INFO 12-02 00:13:06 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 00:13:06 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 00:13:06 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 00:13:06 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 00:13:07 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 00:13:07 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 00:13:07 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 00:13:07 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 00:13:08 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 12-02 00:13:08 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=3, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 12-02 00:13:08 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 12-02 00:13:08 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+[rank2]:[W1202 00:13:10.314419448 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1
+[rank3]:[W1202 00:13:10.398868254 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1
+[rank1]:[W1202 00:13:10.429521514 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1
+[rank0]:[W1202 00:13:10.622553720 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO ncclCommSplit comm 0x1f2f9af0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 1 color 2003953581 key 3- Init START
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO ncclCommSplit comm 0x1eca0630 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 1 color 2003953581 key 1- Init START
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO ncclCommSplit comm 0x20310430 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 1 color 2003953581 key 2- Init START
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO ncclCommSplit comm 0x1f5ed7a0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 1 color 2003953581 key 0- Init START
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO comm 0x1eca0630 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO comm 0x1f2f9af0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO comm 0x20310430 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO comm 0x1f5ed7a0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96976:98205 [1] NCCL INFO [Proxy Service] Device 1 CPU core 104
+lshn-qs-g2ri-2:96976:98206 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 105
+lshn-qs-g2ri-2:96977:98207 [2] NCCL INFO [Proxy Service] Device 2 CPU core 4
+lshn-qs-g2ri-2:96977:98208 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 106
+lshn-qs-g2ri-2:96978:98209 [3] NCCL INFO [Proxy Service] Device 3 CPU core 23
+lshn-qs-g2ri-2:96978:98210 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 26
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-g2ri-2:96975:98211 [0] NCCL INFO [Proxy Service] Device 0 CPU core 113
+lshn-qs-g2ri-2:96975:98212 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 30
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO ncclCommSplit comm 0x1f2f9af0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 1 color 2003953581 key 3 - Init COMPLETE
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO ncclCommSplit comm 0x20310430 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 1 color 2003953581 key 2 - Init COMPLETE
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO ncclCommSplit comm 0x1eca0630 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 1 color 2003953581 key 1 - Init COMPLETE
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO ncclCommSplit comm 0x1f5ed7a0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 1 color 2003953581 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.29 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.23)
+lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.38 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.32)
+lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.26 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.20)
+lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.01)
+[Gloo] Rank 0[Gloo] Rank  is connected to 3[Gloo] Rank 1[Gloo] Rank 32 is connected to  peer ranks. Expected number of connected peer ranks is :  is connected to 3 is connected to 33 peer ranks. 3
+ peer ranks. Expected number of connected peer ranks is :  peer ranks. Expected number of connected peer ranks is : Expected number of connected peer ranks is : 33
+3
+
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO ncclCommSplit comm 0x1f701db0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 2 color 59908776 key 0- Init START
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO comm 0x1f701db0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96975:98235 [0] NCCL INFO [Proxy Service] Device 0 CPU core 3
+lshn-qs-g2ri-2:96975:98236 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 110
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO ncclCommSplit comm 0x1f701db0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 2 color 59908776 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO ncclCommSplit comm 0x1edb4ce0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 4 color 440515407 key 0- Init START
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO comm 0x1edb4ce0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96976:98250 [1] NCCL INFO [Proxy Service] Device 1 CPU core 32
+lshn-qs-g2ri-2:96976:98251 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 136
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO ncclCommSplit comm 0x1edb4ce0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 4 color 440515407 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.04 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.03, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO ncclCommSplit comm 0x20423ff0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 6 color 1227022723 key 0- Init START
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO comm 0x20423ff0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96977:98265 [2] NCCL INFO [Proxy Service] Device 2 CPU core 19
+lshn-qs-g2ri-2:96977:98266 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 41
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO ncclCommSplit comm 0x20423ff0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 6 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO ncclCommSplit comm 0x1f40c020 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 8 color 1301067556 key 0- Init START
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO comm 0x1f40c020 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96978:98282 [3] NCCL INFO [Proxy Service] Device 3 CPU core 29
+lshn-qs-g2ri-2:96978:98283 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 105
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO ncclCommSplit comm 0x1f40c020 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 8 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO ncclCommSplit comm 0x20e5a6a0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 9 color 59908776 key 0- Init START
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO comm 0x20e5a6a0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96975:98291 [0] NCCL INFO [Proxy Service] Device 0 CPU core 12
+lshn-qs-g2ri-2:96975:98292 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 124
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO ncclCommSplit comm 0x20e5a6a0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 9 color 59908776 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.07)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO ncclCommSplit comm 0x204f15c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 11 color 440515407 key 0- Init START
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO comm 0x204f15c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96976:98306 [1] NCCL INFO [Proxy Service] Device 1 CPU core 14
+lshn-qs-g2ri-2:96976:98307 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 118
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO ncclCommSplit comm 0x204f15c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 11 color 440515407 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO ncclCommSplit comm 0x21b6ed10 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 13 color 1227022723 key 0- Init START
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO comm 0x21b6ed10 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96977:98321 [2] NCCL INFO [Proxy Service] Device 2 CPU core 104
+lshn-qs-g2ri-2:96977:98322 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 112
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO ncclCommSplit comm 0x21b6ed10 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 13 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.05 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.04, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO ncclCommSplit comm 0x1f515b90 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 15 color 1301067556 key 0- Init START
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO comm 0x1f515b90 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96978:98338 [3] NCCL INFO [Proxy Service] Device 3 CPU core 41
+lshn-qs-g2ri-2:96978:98339 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 27
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO ncclCommSplit comm 0x1f515b90 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 15 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO ncclCommSplit comm 0x20f622b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 16 color 59908776 key 0- Init START
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO comm 0x20f622b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96975:98347 [0] NCCL INFO [Proxy Service] Device 0 CPU core 3
+lshn-qs-g2ri-2:96975:98348 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 109
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO ncclCommSplit comm 0x20f622b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 16 color 59908776 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.08 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO ncclCommSplit comm 0x205f91d0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 18 color 440515407 key 0- Init START
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO comm 0x205f91d0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96976:98362 [1] NCCL INFO [Proxy Service] Device 1 CPU core 7
+lshn-qs-g2ri-2:96976:98363 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 115
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO ncclCommSplit comm 0x205f91d0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 18 color 440515407 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO ncclCommSplit comm 0x21c76920 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 20 color 1227022723 key 0- Init START
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO comm 0x21c76920 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96977:98377 [2] NCCL INFO [Proxy Service] Device 2 CPU core 105
+lshn-qs-g2ri-2:96977:98378 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 28
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO ncclCommSplit comm 0x21c76920 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 20 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO ncclCommSplit comm 0x20bd6290 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 22 color 1301067556 key 0- Init START
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO comm 0x20bd6290 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96978:98394 [3] NCCL INFO [Proxy Service] Device 3 CPU core 26
+lshn-qs-g2ri-2:96978:98395 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 16
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO ncclCommSplit comm 0x20bd6290 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 22 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.04 (kernels 0.00, alloc 0.01, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO ncclCommSplit comm 0x21069ec0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 23 color 59908776 key 0- Init START
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO comm 0x21069ec0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96975:98403 [0] NCCL INFO [Proxy Service] Device 0 CPU core 140
+lshn-qs-g2ri-2:96975:98404 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 13
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO ncclCommSplit comm 0x21069ec0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 23 color 59908776 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.06)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO ncclCommSplit comm 0x20700de0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 25 color 440515407 key 0- Init START
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO comm 0x20700de0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96976:98418 [1] NCCL INFO [Proxy Service] Device 1 CPU core 132
+lshn-qs-g2ri-2:96976:98419 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 31
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO ncclCommSplit comm 0x20700de0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 25 color 440515407 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO ncclCommSplit comm 0x21d7e530 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 27 color 1227022723 key 0- Init START
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO comm 0x21d7e530 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96977:98433 [2] NCCL INFO [Proxy Service] Device 2 CPU core 3
+lshn-qs-g2ri-2:96977:98434 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 7
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO ncclCommSplit comm 0x21d7e530 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 27 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO ncclCommSplit comm 0x20cddea0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 29 color 1301067556 key 0- Init START
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO comm 0x20cddea0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96978:98450 [3] NCCL INFO [Proxy Service] Device 3 CPU core 101
+lshn-qs-g2ri-2:96978:98451 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 19
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO ncclCommSplit comm 0x20cddea0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 29 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO ncclCommSplit comm 0x21171ad0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 30 color 59908776 key 0- Init START
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO comm 0x21171ad0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96975:98459 [0] NCCL INFO [Proxy Service] Device 0 CPU core 140
+lshn-qs-g2ri-2:96975:98460 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 105
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO ncclCommSplit comm 0x21171ad0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 30 color 59908776 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO ncclCommSplit comm 0x208089f0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 32 color 440515407 key 0- Init START
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO comm 0x208089f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96976:98474 [1] NCCL INFO [Proxy Service] Device 1 CPU core 139
+lshn-qs-g2ri-2:96976:98475 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 96
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO ncclCommSplit comm 0x208089f0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 32 color 440515407 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO ncclCommSplit comm 0x21e86140 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 34 color 1227022723 key 0- Init START
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO comm 0x21e86140 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96977:98489 [2] NCCL INFO [Proxy Service] Device 2 CPU core 32
+lshn-qs-g2ri-2:96977:98490 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 27
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO ncclCommSplit comm 0x21e86140 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 34 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Using network Socket
+INFO 12-02 00:13:11 [parallel_state.py:1165] rank 1 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-02 00:13:11 [parallel_state.py:1165] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO ncclCommSplit comm 0x20de5ab0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 36 color 1301067556 key 0- Init START
+INFO 12-02 00:13:11 [parallel_state.py:1165] rank 2 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO comm 0x20de5ab0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:96978:98501 [3] NCCL INFO [Proxy Service] Device 3 CPU core 104
+lshn-qs-g2ri-2:96978:98502 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 136
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO ncclCommSplit comm 0x20de5ab0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 36 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+INFO 12-02 00:13:11 [parallel_state.py:1165] rank 3 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-02 00:13:11 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-02 00:13:11 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-02 00:13:11 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-02 00:13:11 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-02 00:13:11 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 00:13:12 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 00:13:12 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 00:13:12 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 00:13:12 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 00:13:12 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 00:13:12 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 00:13:12 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 00:13:12 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 00:13:12 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 00:13:12 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 00:13:12 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 00:13:13 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+
+Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
+[AINFO 12-02 00:13:14 [weight_utils.py:369] Time spent downloading weights for deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B: 0.857579 seconds
+INFO 12-02 00:13:14 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+INFO 12-02 00:13:15 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+
+Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:02<00:00,  2.27s/it]
+[ALoading safetensors checkpoint shards: 100% Completed | 1/1 [00:02<00:00,  2.27s/it]
+
+INFO 12-02 00:13:15 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+INFO 12-02 00:13:15 [default_loader.py:268] Loading weights took 1.21 seconds
+INFO 12-02 00:13:15 [default_loader.py:268] Loading weights took 2.48 seconds
+INFO 12-02 00:13:15 [default_loader.py:268] Loading weights took 0.85 seconds
+INFO 12-02 00:13:16 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 3.667976 seconds
+INFO 12-02 00:13:16 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 3.852513 seconds
+INFO 12-02 00:13:16 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 3.800579 seconds
+INFO 12-02 00:13:16 [default_loader.py:268] Loading weights took 0.79 seconds
+INFO 12-02 00:13:16 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 4.440329 seconds
+INFO 12-02 00:13:22 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_0_0/backbone for vLLM's torch.compile
+INFO 12-02 00:13:22 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_3_0/backbone for vLLM's torch.compile
+INFO 12-02 00:13:22 [backends.py:550] Dynamo bytecode transform time: 5.68 s
+INFO 12-02 00:13:22 [backends.py:550] Dynamo bytecode transform time: 5.83 s
+INFO 12-02 00:13:22 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_2_0/backbone for vLLM's torch.compile
+INFO 12-02 00:13:22 [backends.py:550] Dynamo bytecode transform time: 5.72 s
+INFO 12-02 00:13:22 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_1_0/backbone for vLLM's torch.compile
+INFO 12-02 00:13:22 [backends.py:550] Dynamo bytecode transform time: 5.65 s
+INFO 12-02 00:13:25 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 3.052 s
+INFO 12-02 00:13:25 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 3.069 s
+INFO 12-02 00:13:25 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 3.023 s
+INFO 12-02 00:13:25 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.990 s
+INFO 12-02 00:13:26 [monitor.py:34] torch.compile takes 5.83 s in total
+INFO 12-02 00:13:26 [monitor.py:34] torch.compile takes 5.68 s in total
+INFO 12-02 00:13:26 [monitor.py:34] torch.compile takes 5.72 s in total
+INFO 12-02 00:13:26 [monitor.py:34] torch.compile takes 5.65 s in total
+INFO 12-02 00:13:26 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-02 00:13:26 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-02 00:13:26 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-02 00:13:27 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-02 00:13:27 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-02 00:13:27 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-02 00:13:27 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-02 00:13:27 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-02 00:13:27 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 12-02 00:13:27 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 12-02 00:13:27 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 12-02 00:13:27 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/11 [00:00<?, ?it/s][A
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  36%|███▋      | 4/11 [00:00<00:00, 32.23it/s][A
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  73%|███████▎  | 8/11 [00:00<00:00, 34.46it/s][ACapturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 11/11 [00:00<00:00, 35.98it/s]
+INFO 12-02 00:13:28 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-02 00:13:28 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-02 00:13:28 [core.py:218] init engine (profile, create kv cache, warmup model) took 11.45 seconds
+INFO 12-02 00:13:28 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-02 00:13:28 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-02 00:13:28 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.02 seconds
+INFO 12-02 00:13:28 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-02 00:13:28 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-02 00:13:28 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.23 seconds
+INFO 12-02 00:13:28 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-02 00:13:28 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-02 00:13:28 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.10 seconds
+INFO 12-02 00:13:29 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-02 00:13:29 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM
+INFO 12-02 00:13:29 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-02 00:13:29 [__init__.py:36] No IOProcessor plugins requested by the model
+INFO 12-02 00:13:29 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-02 00:13:29 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM
+INFO 12-02 00:13:29 [llm.py:295] Supported_tasks: ('generate',)
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM
+INFO 12-02 00:13:29 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+[OpenTinker] 2025-12-02 00:13:30,340 - accelerate.accelerator - WARNING - Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value.
+lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO ncclCommSplit comm 0x1684d6c0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 37 color 2003953581 key 3- Init START
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO ncclCommSplit comm 0x1b536c40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 37 color 2003953581 key 0- Init START
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO ncclCommSplit comm 0x1ab9a240 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 37 color 2003953581 key 1- Init START
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO ncclCommSplit comm 0x1c131dc0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 37 color 2003953581 key 2- Init START
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO comm 0x1ab9a240 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO comm 0x1684d6c0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO comm 0x1b536c40 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO comm 0x1c131dc0 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:96976:98636 [1] NCCL INFO [Proxy Service] Device 1 CPU core 45
+lshn-qs-g2ri-2:96976:98637 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 3
+lshn-qs-g2ri-2:96978:98638 [3] NCCL INFO [Proxy Service] Device 3 CPU core 40
+lshn-qs-g2ri-2:96978:98639 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 102
+lshn-qs-g2ri-2:96977:98640 [2] NCCL INFO [Proxy Service] Device 2 CPU core 8
+lshn-qs-g2ri-2:96977:98641 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 107
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-g2ri-2:96975:98642 [0] NCCL INFO [Proxy Service] Device 0 CPU core 21
+lshn-qs-g2ri-2:96975:98643 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 35
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO ncclCommSplit comm 0x1684d6c0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 37 color 2003953581 key 3 - Init COMPLETE
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO ncclCommSplit comm 0x1c131dc0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 37 color 2003953581 key 2 - Init COMPLETE
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO ncclCommSplit comm 0x1ab9a240 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 37 color 2003953581 key 1 - Init COMPLETE
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO ncclCommSplit comm 0x1b536c40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 37 color 2003953581 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.12 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.07)
+lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.07)
+lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.12 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.07)
+lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.08 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.02)
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+INFO 12-02 00:13:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:13:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:13:32 [block_pool.py:292] Successfully reset prefix cache
+wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
+
+  0%|          | 0/1024 [00:00<?, ?it/s][AINFO 12-02 00:13:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:13:34 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-02 00:13:35 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-02 00:13:35 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-02 00:13:35 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  0%|          | 1/1024 [02:26<41:30:15, 146.06s/it][A
+                                                    [A{'loss': 0.0591, 'grad_norm': 0.0025745572056621313, 'learning_rate': 1e-05, 'num_tokens': 792270.0, 'completions/mean_length': 6039.171875, 'completions/min_length': 250.0, 'completions/max_length': 15689.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6039.171875, 'completions/min_terminated_length': 250.0, 'completions/max_terminated_length': 15689.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021642697975039482, 'sampling/sampling_logp_difference/max': 8.311287879943848, 'sampling/importance_sampling_ratio/min': 0.0002457273658365011, 'sampling/importance_sampling_ratio/mean': 0.9999940395355225, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.19118632376194, 'clip_ratio/low_mean': 0.0, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 0.0, 'epoch': 0.0}
+
+  0%|          | 1/1024 [02:26<41:30:15, 146.06s/it][AINFO 12-02 00:15:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:15:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:15:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:15:58 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 2/1024 [04:46<40:29:28, 142.63s/it][A
+                                                    [A{'loss': 0.0258, 'grad_norm': 0.002068034838885069, 'learning_rate': 1e-05, 'num_tokens': 1425798.0, 'completions/mean_length': 4767.1875, 'completions/min_length': 556.0, 'completions/max_length': 16292.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4767.1875, 'completions/min_terminated_length': 556.0, 'completions/max_terminated_length': 16292.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01877593621611595, 'sampling/sampling_logp_difference/max': 4.011071681976318, 'sampling/importance_sampling_ratio/min': 0.01811397261917591, 'sampling/importance_sampling_ratio/mean': 0.9999016523361206, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.088237851858139, 'clip_ratio/low_mean': 2.871888784738985e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3748955325354473e-06, 'clip_ratio/high_max': 5.499582130141789e-06, 'clip_ratio/region_mean': 3.009378326623846e-05, 'epoch': 0.0}
+
+  0%|          | 2/1024 [04:46<40:29:28, 142.63s/it][AINFO 12-02 00:18:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:18:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:18:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:18:19 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 3/1024 [07:24<42:28:48, 149.78s/it][A
+                                                    [A{'loss': 0.0199, 'grad_norm': 0.001971944235265255, 'learning_rate': 1e-05, 'num_tokens': 2287420.0, 'completions/mean_length': 6586.359375, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6351.21630859375, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16317.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.29143062233924866, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02137824520468712, 'sampling/sampling_logp_difference/max': 9.834577560424805, 'sampling/importance_sampling_ratio/min': 5.356698966352269e-05, 'sampling/importance_sampling_ratio/mean': 0.9999316334724426, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0497623533010483, 'clip_ratio/low_mean': 4.459846724103045e-05, 'clip_ratio/low_min': 3.4060874440910993e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.459846724103045e-05, 'epoch': 0.0}
+
+  0%|          | 3/1024 [07:24<42:28:48, 149.78s/it][AINFO 12-02 00:20:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:20:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:20:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:20:57 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 4/1024 [09:44<41:22:31, 146.03s/it][A
+                                                    [A{'loss': 0.0433, 'grad_norm': 0.0016465173102915287, 'learning_rate': 1e-05, 'num_tokens': 3009167.0, 'completions/mean_length': 5448.0234375, 'completions/min_length': 707.0, 'completions/max_length': 14690.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5448.0234375, 'completions/min_terminated_length': 707.0, 'completions/max_terminated_length': 14690.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.27958330512046814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020580951124429703, 'sampling/sampling_logp_difference/max': 11.749992370605469, 'sampling/importance_sampling_ratio/min': 7.889385415182915e-06, 'sampling/importance_sampling_ratio/mean': 0.9999774694442749, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1134418621659279, 'clip_ratio/low_mean': 3.218628648937738e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.48578327652649e-06, 'clip_ratio/high_max': 1.7640652004047297e-05, 'clip_ratio/region_mean': 3.767206976590387e-05, 'epoch': 0.0}
+
+  0%|          | 4/1024 [09:44<41:22:31, 146.03s/it][AINFO 12-02 00:23:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:23:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:23:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:23:17 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 5/1024 [12:19<42:14:57, 149.26s/it][A
+                                                    [A{'loss': 0.0153, 'grad_norm': 0.0023770295083522797, 'learning_rate': 1e-05, 'num_tokens': 3725654.0, 'completions/mean_length': 5436.8671875, 'completions/min_length': 26.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5350.66943359375, 'completions/min_terminated_length': 26.0, 'completions/max_terminated_length': 13420.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.27434611320495605, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020377254113554955, 'sampling/sampling_logp_difference/max': 6.799249172210693, 'sampling/importance_sampling_ratio/min': 0.0011146117467433214, 'sampling/importance_sampling_ratio/mean': 0.99991774559021, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1473859176039696, 'clip_ratio/low_mean': 2.8849915906903334e-05, 'clip_ratio/low_min': 8.467687621305231e-06, 'clip_ratio/high_mean': 3.359879997333337e-06, 'clip_ratio/high_max': 1.3439519989333348e-05, 'clip_ratio/region_mean': 3.220979442630778e-05, 'epoch': 0.0}
+
+  0%|          | 5/1024 [12:19<42:14:57, 149.26s/it][AINFO 12-02 00:25:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:25:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:25:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:25:52 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 6/1024 [14:23<39:45:51, 140.62s/it][A
+                                                    [A{'loss': 0.0326, 'grad_norm': 0.003342699259519577, 'learning_rate': 1e-05, 'num_tokens': 4345547.0, 'completions/mean_length': 4697.5390625, 'completions/min_length': 445.0, 'completions/max_length': 14440.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4697.5390625, 'completions/min_terminated_length': 445.0, 'completions/max_terminated_length': 14440.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.34480881690979004, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0185473021119833, 'sampling/sampling_logp_difference/max': 6.038198471069336, 'sampling/importance_sampling_ratio/min': 0.002385853324085474, 'sampling/importance_sampling_ratio/mean': 0.9999914765357971, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0097229778766632, 'clip_ratio/low_mean': 2.8399212624208303e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1630503422566107e-06, 'clip_ratio/high_max': 4.652201369026443e-06, 'clip_ratio/region_mean': 2.9562263534899103e-05, 'epoch': 0.01}
+
+  1%|          | 6/1024 [14:23<39:45:51, 140.62s/it][AINFO 12-02 00:27:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:27:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:27:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:27:56 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 7/1024 [17:03<41:28:58, 146.84s/it][A
+                                                    [A{'loss': 0.0201, 'grad_norm': 0.002073560608550906, 'learning_rate': 1e-05, 'num_tokens': 5160646.0, 'completions/mean_length': 6218.2109375, 'completions/min_length': 156.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5890.2822265625, 'completions/min_terminated_length': 156.0, 'completions/max_terminated_length': 14652.0, 'rewards/accuracy_reward/mean': 0.2109375, 'rewards/accuracy_reward/std': 0.4095771610736847, 'reward': 0.2109375, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020321575924754143, 'sampling/sampling_logp_difference/max': 7.716431617736816, 'sampling/importance_sampling_ratio/min': 0.00044544730917550623, 'sampling/importance_sampling_ratio/mean': 0.9999560117721558, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0579778030514717, 'clip_ratio/low_mean': 6.054362825125281e-05, 'clip_ratio/low_min': 7.427356649714056e-06, 'clip_ratio/high_mean': 2.340648734389106e-06, 'clip_ratio/high_max': 9.362594937556423e-06, 'clip_ratio/region_mean': 6.288427744038927e-05, 'epoch': 0.01}
+
+  1%|          | 7/1024 [17:03<41:28:58, 146.84s/it][AINFO 12-02 00:30:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:30:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:30:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:30:36 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 8/1024 [19:20<40:35:49, 143.85s/it][A
+                                                    [A{'loss': 0.0751, 'grad_norm': 0.0032994600478559732, 'learning_rate': 1e-05, 'num_tokens': 5836289.0, 'completions/mean_length': 5127.8359375, 'completions/min_length': 556.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5039.20458984375, 'completions/min_terminated_length': 556.0, 'completions/max_terminated_length': 13637.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01940803974866867, 'sampling/sampling_logp_difference/max': 6.587049961090088, 'sampling/importance_sampling_ratio/min': 0.0013780994340777397, 'sampling/importance_sampling_ratio/mean': 0.9999483227729797, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0472618415951729, 'clip_ratio/low_mean': 2.2175867059104348e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7660169052978745e-06, 'clip_ratio/high_max': 1.1064067621191498e-05, 'clip_ratio/region_mean': 2.4941883737028547e-05, 'epoch': 0.01}
+
+  1%|          | 8/1024 [19:20<40:35:49, 143.85s/it][AINFO 12-02 00:32:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:32:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:32:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:32:53 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 9/1024 [21:46<40:42:41, 144.40s/it][A
+                                                    [A{'loss': 0.1006, 'grad_norm': 0.003016560571268201, 'learning_rate': 1e-05, 'num_tokens': 6433171.0, 'completions/mean_length': 4516.890625, 'completions/min_length': 238.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4423.44873046875, 'completions/min_terminated_length': 238.0, 'completions/max_terminated_length': 15916.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017437148839235306, 'sampling/sampling_logp_difference/max': 5.206505298614502, 'sampling/importance_sampling_ratio/min': 0.005480794236063957, 'sampling/importance_sampling_ratio/mean': 0.9999179840087891, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.911251038312912, 'clip_ratio/low_mean': 3.000627111759968e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0894711926521268e-06, 'clip_ratio/high_max': 1.2357884770608507e-05, 'clip_ratio/region_mean': 3.309574231025181e-05, 'epoch': 0.01}
+
+  1%|          | 9/1024 [21:46<40:42:41, 144.40s/it][AINFO 12-02 00:35:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:35:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:35:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:35:19 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 10/1024 [24:20<41:33:23, 147.54s/it][A
+                                                     [A{'loss': 0.0596, 'grad_norm': 0.0035386616364121437, 'learning_rate': 1e-05, 'num_tokens': 7085389.0, 'completions/mean_length': 4961.453125, 'completions/min_length': 483.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4687.31201171875, 'completions/min_terminated_length': 483.0, 'completions/max_terminated_length': 13970.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.3816363215446472, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01566406339406967, 'sampling/sampling_logp_difference/max': 8.20454216003418, 'sampling/importance_sampling_ratio/min': 0.0002734088629949838, 'sampling/importance_sampling_ratio/mean': 0.9999173879623413, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6808596402406693, 'clip_ratio/low_mean': 7.069455705277505e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1582253364395001e-05, 'clip_ratio/high_max': 4.6329013457580004e-05, 'clip_ratio/region_mean': 8.227681109929108e-05, 'epoch': 0.01}
+
+  1%|          | 10/1024 [24:20<41:33:23, 147.54s/it][AINFO 12-02 00:37:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:37:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:37:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:37:53 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 11/1024 [26:38<40:39:23, 144.49s/it][A
+                                                     [A{'loss': 0.0031, 'grad_norm': 0.0022535293828696012, 'learning_rate': 1e-05, 'num_tokens': 7672185.0, 'completions/mean_length': 4429.40625, 'completions/min_length': 25.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4335.275390625, 'completions/min_terminated_length': 25.0, 'completions/max_terminated_length': 14776.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.20357418060302734, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018429335206747055, 'sampling/sampling_logp_difference/max': 16.74998664855957, 'sampling/importance_sampling_ratio/min': 5.315856554943821e-08, 'sampling/importance_sampling_ratio/mean': 0.9998801946640015, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9181502386927605, 'clip_ratio/low_mean': 2.2395396172214532e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.079766535549425e-06, 'clip_ratio/high_max': 2.43190661421977e-05, 'clip_ratio/region_mean': 2.8475162707763957e-05, 'epoch': 0.01}
+
+  1%|          | 11/1024 [26:38<40:39:23, 144.49s/it][AINFO 12-02 00:40:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:40:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:40:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:40:11 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 12/1024 [29:18<41:57:31, 149.26s/it][A
+                                                     [A{'loss': 0.0971, 'grad_norm': 0.0013591813622042537, 'learning_rate': 1e-05, 'num_tokens': 8369000.0, 'completions/mean_length': 5282.6796875, 'completions/min_length': 323.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5106.46875, 'completions/min_terminated_length': 323.0, 'completions/max_terminated_length': 14029.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3029736578464508, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020221836864948273, 'sampling/sampling_logp_difference/max': 10.134092330932617, 'sampling/importance_sampling_ratio/min': 3.970265970565379e-05, 'sampling/importance_sampling_ratio/mean': 0.9998897314071655, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.113751620054245, 'clip_ratio/low_mean': 1.1982813475697185e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.529331482037378e-06, 'clip_ratio/high_max': 1.0117325928149512e-05, 'clip_ratio/region_mean': 1.45121450714214e-05, 'epoch': 0.01}
+
+  1%|          | 12/1024 [29:18<41:57:31, 149.26s/it][AINFO 12-02 00:42:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:42:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:42:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:42:51 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|▏         | 13/1024 [32:08<43:38:33, 155.40s/it][A
+                                                     [A{'loss': 0.0713, 'grad_norm': 0.0024079051800072193, 'learning_rate': 1e-05, 'num_tokens': 9283182.0, 'completions/mean_length': 6970.421875, 'completions/min_length': 53.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6744.49609375, 'completions/min_terminated_length': 53.0, 'completions/max_terminated_length': 15925.0, 'rewards/accuracy_reward/mean': 0.171875, 'rewards/accuracy_reward/std': 0.3787541687488556, 'reward': 0.171875, 'reward_std': 0.17965975403785706, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021462474018335342, 'sampling/sampling_logp_difference/max': 7.0225830078125, 'sampling/importance_sampling_ratio/min': 0.0008915197686292231, 'sampling/importance_sampling_ratio/mean': 0.9999163746833801, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1721933633089066, 'clip_ratio/low_mean': 2.5284593846208736e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3529895568353822e-06, 'clip_ratio/high_max': 5.411958227341529e-06, 'clip_ratio/region_mean': 2.6637583516730956e-05, 'epoch': 0.01}
+
+  1%|▏         | 13/1024 [32:08<43:38:33, 155.40s/it][AINFO 12-02 00:45:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:45:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:45:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:45:41 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|▏         | 14/1024 [34:16<41:18:45, 147.25s/it][A
+                                                     [A{'loss': 0.0386, 'grad_norm': 0.002063734456896782, 'learning_rate': 1e-05, 'num_tokens': 9928446.0, 'completions/mean_length': 4886.875, 'completions/min_length': 434.0, 'completions/max_length': 14658.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4886.875, 'completions/min_terminated_length': 434.0, 'completions/max_terminated_length': 14658.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.2409384697675705, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01918785460293293, 'sampling/sampling_logp_difference/max': 7.9093756675720215, 'sampling/importance_sampling_ratio/min': 0.0003672837920021266, 'sampling/importance_sampling_ratio/mean': 1.0000026226043701, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0108910650014877, 'clip_ratio/low_mean': 2.4304956298237812e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.165383981875493e-06, 'clip_ratio/high_max': 2.0661535927501973e-05, 'clip_ratio/region_mean': 2.947033948430544e-05, 'epoch': 0.01}
+
+  1%|▏         | 14/1024 [34:16<41:18:45, 147.25s/it][AINFO 12-02 00:47:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:47:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:47:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:47:49 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|▏         | 15/1024 [36:20<39:17:52, 140.21s/it][A
+                                                     [A{'loss': 0.0485, 'grad_norm': 0.002424790756776929, 'learning_rate': 1e-05, 'num_tokens': 10566415.0, 'completions/mean_length': 4824.0078125, 'completions/min_length': 103.0, 'completions/max_length': 12992.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4824.0078125, 'completions/min_terminated_length': 103.0, 'completions/max_terminated_length': 12992.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.23698672652244568, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02069389820098877, 'sampling/sampling_logp_difference/max': 6.749993801116943, 'sampling/importance_sampling_ratio/min': 0.0011708867968991399, 'sampling/importance_sampling_ratio/mean': 1.0000125169754028, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1070282831788063, 'clip_ratio/low_mean': 4.4761846993424115e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.4761846993424115e-06, 'epoch': 0.01}
+
+  1%|▏         | 15/1024 [36:20<39:17:52, 140.21s/it][AINFO 12-02 00:49:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:49:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:49:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:49:53 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 16/1024 [38:37<38:59:31, 139.26s/it][A
+                                                     [A{'loss': 0.0252, 'grad_norm': 0.0021046048495918512, 'learning_rate': 1e-05, 'num_tokens': 11281908.0, 'completions/mean_length': 5449.4140625, 'completions/min_length': 23.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5363.31494140625, 'completions/min_terminated_length': 23.0, 'completions/max_terminated_length': 12685.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.27168765664100647, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019556276500225067, 'sampling/sampling_logp_difference/max': 4.322004318237305, 'sampling/importance_sampling_ratio/min': 0.013273254036903381, 'sampling/importance_sampling_ratio/mean': 0.9999805688858032, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9817888736724854, 'clip_ratio/low_mean': 2.2676964135825983e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.768976158535224e-07, 'clip_ratio/high_max': 3.5075904634140898e-06, 'clip_ratio/region_mean': 2.3553861751679506e-05, 'epoch': 0.01}
+
+  2%|▏         | 16/1024 [38:37<38:59:31, 139.26s/it][AINFO 12-02 00:52:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:52:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:52:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:52:10 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 17/1024 [40:45<38:00:43, 135.89s/it][A
+                                                     [A{'loss': 0.0584, 'grad_norm': 0.0035894038155674934, 'learning_rate': 1e-05, 'num_tokens': 11987692.0, 'completions/mean_length': 5343.25, 'completions/min_length': 324.0, 'completions/max_length': 14133.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5343.25, 'completions/min_terminated_length': 324.0, 'completions/max_terminated_length': 14133.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.3124620020389557, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020530637353658676, 'sampling/sampling_logp_difference/max': 10.749964714050293, 'sampling/importance_sampling_ratio/min': 2.1446165192173794e-05, 'sampling/importance_sampling_ratio/mean': 0.9998996257781982, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.04741720110178, 'clip_ratio/low_mean': 5.4349347919924185e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.060540163663973e-06, 'clip_ratio/high_max': 1.624216065465589e-05, 'clip_ratio/region_mean': 5.840988796990132e-05, 'epoch': 0.02}
+
+  2%|▏         | 17/1024 [40:45<38:00:43, 135.89s/it][AINFO 12-02 00:54:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:54:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:54:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:54:18 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 18/1024 [43:32<40:34:16, 145.19s/it][A
+                                                     [A{'loss': 0.0811, 'grad_norm': 0.0032246762420982122, 'learning_rate': 1e-05, 'num_tokens': 12814244.0, 'completions/mean_length': 6301.9375, 'completions/min_length': 72.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5806.09814453125, 'completions/min_terminated_length': 72.0, 'completions/max_terminated_length': 15138.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.3606000542640686, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017541853711009026, 'sampling/sampling_logp_difference/max': 3.846651554107666, 'sampling/importance_sampling_ratio/min': 0.021351110190153122, 'sampling/importance_sampling_ratio/mean': 0.9999184608459473, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8892941772937775, 'clip_ratio/low_mean': 4.272115029380075e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.272115029380075e-05, 'epoch': 0.02}
+
+  2%|▏         | 18/1024 [43:32<40:34:16, 145.19s/it][AINFO 12-02 00:57:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:57:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:57:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:57:05 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 19/1024 [45:57<40:30:05, 145.08s/it][A
+                                                     [A{'loss': -0.0153, 'grad_norm': 0.0023969109170138836, 'learning_rate': 1e-05, 'num_tokens': 13512520.0, 'completions/mean_length': 5297.46875, 'completions/min_length': 2.0, 'completions/max_length': 16213.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5297.46875, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 16213.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.248829185962677, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017464376986026764, 'sampling/sampling_logp_difference/max': 5.155758380889893, 'sampling/importance_sampling_ratio/min': 0.005766105372458696, 'sampling/importance_sampling_ratio/mean': 0.9999222159385681, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8097029253840446, 'clip_ratio/low_mean': 2.772165316855535e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4891505745472386e-06, 'clip_ratio/high_max': 9.956602298188955e-06, 'clip_ratio/region_mean': 3.0210803743102588e-05, 'epoch': 0.02}
+
+  2%|▏         | 19/1024 [45:57<40:30:05, 145.08s/it][AINFO 12-02 00:59:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:59:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:59:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:59:30 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 20/1024 [48:29<41:04:44, 147.30s/it][A
+                                                     [A{'loss': -0.0014, 'grad_norm': 0.002759338356554508, 'learning_rate': 1e-05, 'num_tokens': 14155556.0, 'completions/mean_length': 4890.34375, 'completions/min_length': 68.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4799.84228515625, 'completions/min_terminated_length': 68.0, 'completions/max_terminated_length': 14655.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018839433789253235, 'sampling/sampling_logp_difference/max': 4.768747329711914, 'sampling/importance_sampling_ratio/min': 0.008491010405123234, 'sampling/importance_sampling_ratio/mean': 0.9999570250511169, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9267145916819572, 'clip_ratio/low_mean': 3.173396362399217e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.524584374441474e-06, 'clip_ratio/high_max': 1.0098337497765897e-05, 'clip_ratio/region_mean': 3.425854845318099e-05, 'epoch': 0.02}
+
+  2%|▏         | 20/1024 [48:29<41:04:44, 147.30s/it][AINFO 12-02 01:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:02:02 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 21/1024 [50:43<39:51:57, 143.09s/it][A
+                                                     [A{'loss': -0.0051, 'grad_norm': 0.0021055075339972973, 'learning_rate': 1e-05, 'num_tokens': 14765328.0, 'completions/mean_length': 4609.40625, 'completions/min_length': 461.0, 'completions/max_length': 16296.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4609.40625, 'completions/min_terminated_length': 461.0, 'completions/max_terminated_length': 16296.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020226795226335526, 'sampling/sampling_logp_difference/max': 14.437457084655762, 'sampling/importance_sampling_ratio/min': 5.368983693188056e-07, 'sampling/importance_sampling_ratio/mean': 0.9999741911888123, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.171089917421341, 'clip_ratio/low_mean': 1.9051809317716106e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.883097297650238e-06, 'clip_ratio/high_max': 7.532389190600952e-06, 'clip_ratio/region_mean': 2.0934906729053182e-05, 'epoch': 0.02}
+
+  2%|▏         | 21/1024 [50:43<39:51:57, 143.09s/it][AINFO 12-02 01:04:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:04:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:04:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:04:15 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 22/1024 [53:00<39:22:44, 141.48s/it][A
+                                                     [A{'loss': 0.0236, 'grad_norm': 0.0027595218271017075, 'learning_rate': 1e-05, 'num_tokens': 15438549.0, 'completions/mean_length': 5099.0390625, 'completions/min_length': 539.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5010.18115234375, 'completions/min_terminated_length': 539.0, 'completions/max_terminated_length': 14299.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.20069602131843567, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01902824640274048, 'sampling/sampling_logp_difference/max': 8.87494945526123, 'sampling/importance_sampling_ratio/min': 0.00013984869292471558, 'sampling/importance_sampling_ratio/mean': 0.9999887347221375, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.005959376692772, 'clip_ratio/low_mean': 5.869748633813288e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.2923934415739495e-06, 'clip_ratio/high_max': 1.7169573766295798e-05, 'clip_ratio/region_mean': 1.0162142189074075e-05, 'epoch': 0.02}
+
+  2%|▏         | 22/1024 [53:00<39:22:44, 141.48s/it][AINFO 12-02 01:06:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:06:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:06:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:06:33 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 23/1024 [55:50<41:39:47, 149.84s/it][A
+                                                     [A{'loss': 0.0647, 'grad_norm': 0.002748022088780999, 'learning_rate': 1e-05, 'num_tokens': 16373898.0, 'completions/mean_length': 7138.0390625, 'completions/min_length': 729.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6839.7822265625, 'completions/min_terminated_length': 729.0, 'completions/max_terminated_length': 16204.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.3169426918029785, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020853528752923012, 'sampling/sampling_logp_difference/max': 7.874569416046143, 'sampling/importance_sampling_ratio/min': 0.0003802926803473383, 'sampling/importance_sampling_ratio/mean': 0.9999048709869385, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0403362140059471, 'clip_ratio/low_mean': 3.6872071063953626e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2907356676805648e-06, 'clip_ratio/high_max': 5.162942670722259e-06, 'clip_ratio/region_mean': 3.816280593582633e-05, 'epoch': 0.02}
+
+  2%|▏         | 23/1024 [55:50<41:39:47, 149.84s/it][AINFO 12-02 01:09:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:09:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:09:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:09:23 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 24/1024 [58:19<41:37:04, 149.82s/it][A
+                                                     [A{'loss': 0.0756, 'grad_norm': 0.0029502976685762405, 'learning_rate': 1e-05, 'num_tokens': 17088156.0, 'completions/mean_length': 5420.515625, 'completions/min_length': 19.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5334.18896484375, 'completions/min_terminated_length': 19.0, 'completions/max_terminated_length': 14543.0, 'rewards/accuracy_reward/mean': 0.1953125, 'rewards/accuracy_reward/std': 0.3979988098144531, 'reward': 0.1953125, 'reward_std': 0.25620076060295105, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0199423898011446, 'sampling/sampling_logp_difference/max': 9.239787101745605, 'sampling/importance_sampling_ratio/min': 9.70982582657598e-05, 'sampling/importance_sampling_ratio/mean': 0.9999445676803589, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1339883506298065, 'clip_ratio/low_mean': 5.6506045439164154e-05, 'clip_ratio/low_min': 5.709326615033206e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.6506045439164154e-05, 'epoch': 0.02}
+
+  2%|▏         | 24/1024 [58:19<41:37:04, 149.82s/it][AINFO 12-02 01:11:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:11:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:11:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:11:52 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 25/1024 [1:00:55<42:01:37, 151.45s/it][A
+                                                       [A{'loss': 0.0522, 'grad_norm': 0.004891107324510813, 'learning_rate': 1e-05, 'num_tokens': 17766619.0, 'completions/mean_length': 5116.3046875, 'completions/min_length': 10.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4845.88037109375, 'completions/min_terminated_length': 10.0, 'completions/max_terminated_length': 14636.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.3366856575012207, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01914183795452118, 'sampling/sampling_logp_difference/max': 6.847696304321289, 'sampling/importance_sampling_ratio/min': 0.0010618992382660508, 'sampling/importance_sampling_ratio/mean': 0.999970018863678, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9503882825374603, 'clip_ratio/low_mean': 6.439320418394345e-05, 'clip_ratio/low_min': 4.70632539872895e-06, 'clip_ratio/high_mean': 1.4049995797904558e-06, 'clip_ratio/high_max': 5.619998319161823e-06, 'clip_ratio/region_mean': 6.57982034226734e-05, 'epoch': 0.02}
+
+  2%|▏         | 25/1024 [1:00:55<42:01:37, 151.45s/it][AINFO 12-02 01:14:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:14:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:14:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:14:28 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 26/1024 [1:03:17<41:11:34, 148.59s/it][A
+                                                       [A{'loss': 0.0246, 'grad_norm': 0.0030373274348676205, 'learning_rate': 1e-05, 'num_tokens': 18432938.0, 'completions/mean_length': 5061.8671875, 'completions/min_length': 281.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4972.71630859375, 'completions/min_terminated_length': 281.0, 'completions/max_terminated_length': 14634.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.28118088841438293, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019548218697309494, 'sampling/sampling_logp_difference/max': 13.272432327270508, 'sampling/importance_sampling_ratio/min': 1.7212972807101323e-06, 'sampling/importance_sampling_ratio/mean': 0.9999624490737915, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0540335327386856, 'clip_ratio/low_mean': 3.839018643247982e-05, 'clip_ratio/low_min': 4.115091087442124e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.839018643247982e-05, 'epoch': 0.02}
+
+  3%|▎         | 26/1024 [1:03:17<41:11:34, 148.59s/it][AINFO 12-02 01:16:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:16:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:16:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:16:49 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 27/1024 [1:05:38<40:32:20, 146.38s/it][A
+                                                       [A{'loss': 0.0011, 'grad_norm': 0.0017623496241867542, 'learning_rate': 1e-05, 'num_tokens': 19264743.0, 'completions/mean_length': 6349.9765625, 'completions/min_length': 2.0, 'completions/max_length': 15782.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6349.9765625, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 15782.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.33903974294662476, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019106190651655197, 'sampling/sampling_logp_difference/max': 9.585708618164062, 'sampling/importance_sampling_ratio/min': 6.870362267363816e-05, 'sampling/importance_sampling_ratio/mean': 1.0000061988830566, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0268081277608871, 'clip_ratio/low_mean': 3.751162262233265e-05, 'clip_ratio/low_min': 4.413062470121076e-06, 'clip_ratio/high_mean': 4.665093399580655e-06, 'clip_ratio/high_max': 1.4656657867817557e-05, 'clip_ratio/region_mean': 4.2176716192443564e-05, 'epoch': 0.02}
+
+  3%|▎         | 27/1024 [1:05:38<40:32:20, 146.38s/it][AINFO 12-02 01:19:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:19:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:19:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:19:11 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 28/1024 [1:08:20<41:49:33, 151.18s/it][A
+                                                       [A{'loss': -0.0162, 'grad_norm': 0.003111837198957801, 'learning_rate': 1e-05, 'num_tokens': 20030109.0, 'completions/mean_length': 5815.484375, 'completions/min_length': 607.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5561.84033203125, 'completions/min_terminated_length': 607.0, 'completions/max_terminated_length': 15944.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.32719242572784424, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020060991868376732, 'sampling/sampling_logp_difference/max': 3.5108861923217773, 'sampling/importance_sampling_ratio/min': 0.02987043187022209, 'sampling/importance_sampling_ratio/mean': 1.0000298023223877, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0389493256807327, 'clip_ratio/low_mean': 3.09787185415189e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3053439690556843e-06, 'clip_ratio/high_max': 9.221375876222737e-06, 'clip_ratio/region_mean': 3.328406273794826e-05, 'epoch': 0.03}
+
+  3%|▎         | 28/1024 [1:08:20<41:49:33, 151.18s/it][AINFO 12-02 01:21:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:21:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:21:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:21:53 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 29/1024 [1:10:51<41:44:16, 151.01s/it][A
+                                                       [A{'loss': 0.0471, 'grad_norm': 0.003041633637621999, 'learning_rate': 1e-05, 'num_tokens': 20710904.0, 'completions/mean_length': 5157.1484375, 'completions/min_length': 11.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5068.748046875, 'completions/min_terminated_length': 11.0, 'completions/max_terminated_length': 15371.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.35612428188323975, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019007597118616104, 'sampling/sampling_logp_difference/max': 3.133340835571289, 'sampling/importance_sampling_ratio/min': 0.04357198625802994, 'sampling/importance_sampling_ratio/mean': 0.9999587535858154, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0510126948356628, 'clip_ratio/low_mean': 4.474762545214617e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6952535588643514e-06, 'clip_ratio/high_max': 6.7810142354574054e-06, 'clip_ratio/region_mean': 4.644287901101052e-05, 'epoch': 0.03}
+
+  3%|▎         | 29/1024 [1:10:51<41:44:16, 151.01s/it][AINFO 12-02 01:24:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:24:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:24:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:24:24 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 30/1024 [1:13:00<39:51:44, 144.37s/it][A
+                                                       [A{'loss': -0.001, 'grad_norm': 0.0022369560319930315, 'learning_rate': 1e-05, 'num_tokens': 21298497.0, 'completions/mean_length': 4446.3828125, 'completions/min_length': 73.0, 'completions/max_length': 15333.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4446.3828125, 'completions/min_terminated_length': 73.0, 'completions/max_terminated_length': 15333.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.24169495701789856, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01947362720966339, 'sampling/sampling_logp_difference/max': 5.00492525100708, 'sampling/importance_sampling_ratio/min': 0.006704842206090689, 'sampling/importance_sampling_ratio/mean': 0.9998750686645508, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.053279548883438, 'clip_ratio/low_mean': 2.0962848566341563e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.0962848566341563e-05, 'epoch': 0.03}
+
+  3%|▎         | 30/1024 [1:13:00<39:51:44, 144.37s/it][AINFO 12-02 01:26:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:26:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:26:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:26:33 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 31/1024 [1:15:34<40:37:23, 147.27s/it][A
+                                                       [A{'loss': 0.033, 'grad_norm': 0.0021383841522037983, 'learning_rate': 1e-05, 'num_tokens': 22124812.0, 'completions/mean_length': 6294.1484375, 'completions/min_length': 548.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6133.9921875, 'completions/min_terminated_length': 548.0, 'completions/max_terminated_length': 15386.0, 'rewards/accuracy_reward/mean': 0.171875, 'rewards/accuracy_reward/std': 0.3787541687488556, 'reward': 0.171875, 'reward_std': 0.20752590894699097, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.022367021068930626, 'sampling/sampling_logp_difference/max': 14.742476463317871, 'sampling/importance_sampling_ratio/min': 3.9575263599544996e-07, 'sampling/importance_sampling_ratio/mean': 0.9999858736991882, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.2036212533712387, 'clip_ratio/low_mean': 2.8460265411922592e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.8460265411922592e-05, 'epoch': 0.03}
+
+  3%|▎         | 31/1024 [1:15:34<40:37:23, 147.27s/it][AINFO 12-02 01:29:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:29:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:29:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:29:07 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 32/1024 [1:18:18<41:59:50, 152.41s/it][A
+                                                       [A{'loss': 0.0119, 'grad_norm': 0.00126531848218292, 'learning_rate': 1e-05, 'num_tokens': 22915091.0, 'completions/mean_length': 6011.4921875, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5929.81884765625, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 14893.0, 'rewards/accuracy_reward/mean': 0.171875, 'rewards/accuracy_reward/std': 0.3787541687488556, 'reward': 0.171875, 'reward_std': 0.2330477386713028, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019905246794223785, 'sampling/sampling_logp_difference/max': 11.02016544342041, 'sampling/importance_sampling_ratio/min': 1.6368276192224585e-05, 'sampling/importance_sampling_ratio/mean': 0.9999861121177673, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.123318687081337, 'clip_ratio/low_mean': 3.19569651310303e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.34661660619895e-06, 'clip_ratio/high_max': 1.73864664247958e-05, 'clip_ratio/region_mean': 3.630358173722925e-05, 'epoch': 0.03}
+
+  3%|▎         | 32/1024 [1:18:18<41:59:50, 152.41s/it][AINFO 12-02 01:31:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:31:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:31:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:31:51 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 33/1024 [1:20:43<41:18:05, 150.04s/it][A
+                                                       [A{'loss': 0.1026, 'grad_norm': 0.0030069497879594564, 'learning_rate': 1e-05, 'num_tokens': 23596487.0, 'completions/mean_length': 5152.46875, 'completions/min_length': 128.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5064.03125, 'completions/min_terminated_length': 128.0, 'completions/max_terminated_length': 16332.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.29142576456069946, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019003981724381447, 'sampling/sampling_logp_difference/max': 13.919804573059082, 'sampling/importance_sampling_ratio/min': 9.009604013954231e-07, 'sampling/importance_sampling_ratio/mean': 0.9999433755874634, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0477670058608055, 'clip_ratio/low_mean': 3.818478444372886e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.188304493865871e-06, 'clip_ratio/high_max': 2.8753217975463485e-05, 'clip_ratio/region_mean': 4.537308905128157e-05, 'epoch': 0.03}
+
+  3%|▎         | 33/1024 [1:20:43<41:18:05, 150.04s/it][AINFO 12-02 01:34:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:34:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:34:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:34:15 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 34/1024 [1:22:31<37:48:11, 137.47s/it][A
+                                                       [A{'loss': 0.0724, 'grad_norm': 0.002676331205293536, 'learning_rate': 1e-05, 'num_tokens': 24213408.0, 'completions/mean_length': 4672.5703125, 'completions/min_length': 4.0, 'completions/max_length': 12792.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4672.5703125, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 12792.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.2988021969795227, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0179576613008976, 'sampling/sampling_logp_difference/max': 6.618741989135742, 'sampling/importance_sampling_ratio/min': 0.0013351094676181674, 'sampling/importance_sampling_ratio/mean': 1.0000251531600952, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9471446052193642, 'clip_ratio/low_mean': 3.4847614415411954e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.673938625885057e-06, 'clip_ratio/high_max': 3.069575450354023e-05, 'clip_ratio/region_mean': 4.252155258654966e-05, 'epoch': 0.03}
+
+  3%|▎         | 34/1024 [1:22:31<37:48:11, 137.47s/it][AINFO 12-02 01:36:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:36:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:36:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:36:04 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 35/1024 [1:25:08<39:24:48, 143.47s/it][A
+                                                       [A{'loss': 0.0384, 'grad_norm': 0.0023615453392267227, 'learning_rate': 1e-05, 'num_tokens': 25130262.0, 'completions/mean_length': 7013.734375, 'completions/min_length': 326.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6711.4677734375, 'completions/min_terminated_length': 326.0, 'completions/max_terminated_length': 16108.0, 'rewards/accuracy_reward/mean': 0.1953125, 'rewards/accuracy_reward/std': 0.3979988098144531, 'reward': 0.1953125, 'reward_std': 0.26485776901245117, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0215257927775383, 'sampling/sampling_logp_difference/max': 11.925450325012207, 'sampling/importance_sampling_ratio/min': 6.6197676460433286e-06, 'sampling/importance_sampling_ratio/mean': 0.999954342842102, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1254516392946243, 'clip_ratio/low_mean': 2.6127243245355203e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.6127243245355203e-05, 'epoch': 0.03}
+
+  3%|▎         | 35/1024 [1:25:08<39:24:48, 143.47s/it][AINFO 12-02 01:38:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:38:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:38:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:38:41 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▎         | 36/1024 [1:27:52<41:03:34, 149.61s/it][A
+                                                       [A{'loss': 0.1041, 'grad_norm': 0.0029018481727689505, 'learning_rate': 1e-05, 'num_tokens': 25898194.0, 'completions/mean_length': 5858.59375, 'completions/min_length': 189.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5605.984375, 'completions/min_terminated_length': 189.0, 'completions/max_terminated_length': 15673.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019959844648838043, 'sampling/sampling_logp_difference/max': 10.992064476013184, 'sampling/importance_sampling_ratio/min': 1.6834765119710937e-05, 'sampling/importance_sampling_ratio/mean': 0.9999915957450867, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0713739022612572, 'clip_ratio/low_mean': 4.180071573500754e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.01738510238647e-06, 'clip_ratio/high_max': 4.06954040954588e-06, 'clip_ratio/region_mean': 4.281810015527299e-05, 'epoch': 0.03}
+
+  4%|▎         | 36/1024 [1:27:52<41:03:34, 149.61s/it][AINFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▎         | 37/1024 [1:30:49<43:15:34, 157.79s/it][A
+                                                       [A{'loss': 0.0637, 'grad_norm': 0.0022342968732118607, 'learning_rate': 1e-05, 'num_tokens': 26812791.0, 'completions/mean_length': 6952.6015625, 'completions/min_length': 3.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6726.24853515625, 'completions/min_terminated_length': 3.0, 'completions/max_terminated_length': 14004.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.26827272772789, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02002539485692978, 'sampling/sampling_logp_difference/max': 9.999964714050293, 'sampling/importance_sampling_ratio/min': 4.540153167909011e-05, 'sampling/importance_sampling_ratio/mean': 0.9999532699584961, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.028619796037674, 'clip_ratio/low_mean': 3.29701083501277e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2027068073148257e-06, 'clip_ratio/high_max': 1.2810827229259303e-05, 'clip_ratio/region_mean': 3.617281504375569e-05, 'epoch': 0.03}
+
+  4%|▎         | 37/1024 [1:30:49<43:15:34, 157.79s/it][AINFO 12-02 01:44:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:44:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:44:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:44:22 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▎         | 38/1024 [1:33:37<44:03:48, 160.88s/it][A
+                                                       [A{'loss': 0.0275, 'grad_norm': 0.0021800603717565536, 'learning_rate': 1e-05, 'num_tokens': 27652757.0, 'completions/mean_length': 6413.421875, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6174.12841796875, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 16361.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019722118973731995, 'sampling/sampling_logp_difference/max': 10.153130531311035, 'sampling/importance_sampling_ratio/min': 3.895394547726028e-05, 'sampling/importance_sampling_ratio/mean': 0.9999439120292664, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9452399462461472, 'clip_ratio/low_mean': 4.09088329433871e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.960676159906143e-06, 'clip_ratio/high_max': 1.5225089100567857e-05, 'clip_ratio/region_mean': 4.7869508762232726e-05, 'epoch': 0.03}
+
+  4%|▎         | 38/1024 [1:33:37<44:03:48, 160.88s/it][AINFO 12-02 01:47:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:47:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:47:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:47:10 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 39/1024 [1:35:54<42:01:02, 153.57s/it][A
+                                                       [A{'loss': 0.0798, 'grad_norm': 0.004153470974415541, 'learning_rate': 1e-05, 'num_tokens': 28334386.0, 'completions/mean_length': 5176.3515625, 'completions/min_length': 130.0, 'completions/max_length': 15754.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5176.3515625, 'completions/min_terminated_length': 130.0, 'completions/max_terminated_length': 15754.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.32035762071609497, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020159056410193443, 'sampling/sampling_logp_difference/max': 4.903430938720703, 'sampling/importance_sampling_ratio/min': 0.007421077694743872, 'sampling/importance_sampling_ratio/mean': 0.9999774694442749, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0444758981466293, 'clip_ratio/low_mean': 3.9564903318023426e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.9564903318023426e-05, 'epoch': 0.04}
+
+  4%|▍         | 39/1024 [1:35:54<42:01:02, 153.57s/it][AINFO 12-02 01:49:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:49:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:49:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:49:27 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 40/1024 [1:38:20<41:23:47, 151.45s/it][A
+                                                       [A{'loss': 0.0067, 'grad_norm': 0.002477057045325637, 'learning_rate': 1e-05, 'num_tokens': 29017145.0, 'completions/mean_length': 5178.9921875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5001.13525390625, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15554.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.29932135343551636, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019826076924800873, 'sampling/sampling_logp_difference/max': 5.374985694885254, 'sampling/importance_sampling_ratio/min': 0.004630985204130411, 'sampling/importance_sampling_ratio/mean': 1.0000497102737427, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0803537145256996, 'clip_ratio/low_mean': 2.0204584302518924e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.3143586481164675e-06, 'clip_ratio/high_max': 1.725743459246587e-05, 'clip_ratio/region_mean': 2.451894306432223e-05, 'epoch': 0.04}
+
+  4%|▍         | 40/1024 [1:38:20<41:23:47, 151.45s/it][AINFO 12-02 01:51:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:51:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:51:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:51:53 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 41/1024 [1:40:31<39:39:21, 145.23s/it][A
+                                                       [A{'loss': -0.0097, 'grad_norm': 0.0016275218222290277, 'learning_rate': 1e-05, 'num_tokens': 29673535.0, 'completions/mean_length': 4980.359375, 'completions/min_length': 329.0, 'completions/max_length': 14131.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4980.359375, 'completions/min_terminated_length': 329.0, 'completions/max_terminated_length': 14131.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.26249876618385315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01844976656138897, 'sampling/sampling_logp_difference/max': 7.419386386871338, 'sampling/importance_sampling_ratio/min': 0.000599516904912889, 'sampling/importance_sampling_ratio/mean': 0.9999750852584839, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9510642662644386, 'clip_ratio/low_mean': 2.1970684144889674e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.159498075750889e-06, 'clip_ratio/high_max': 1.6637992303003557e-05, 'clip_ratio/region_mean': 2.6130182106953725e-05, 'epoch': 0.04}
+
+  4%|▍         | 41/1024 [1:40:31<39:39:21, 145.23s/it][AINFO 12-02 01:54:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:54:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:54:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:54:04 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 42/1024 [1:43:04<40:13:22, 147.46s/it][A
+                                                       [A{'loss': -0.0045, 'grad_norm': 0.001847646082751453, 'learning_rate': 1e-05, 'num_tokens': 30436416.0, 'completions/mean_length': 5778.6953125, 'completions/min_length': 691.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5695.18896484375, 'completions/min_terminated_length': 691.0, 'completions/max_terminated_length': 15328.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.33903977274894714, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021502099931240082, 'sampling/sampling_logp_difference/max': 8.499895095825195, 'sampling/importance_sampling_ratio/min': 0.00020348970429040492, 'sampling/importance_sampling_ratio/mean': 0.9998501539230347, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0413239300251007, 'clip_ratio/low_mean': 3.9683913541921356e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.021798296591442e-06, 'clip_ratio/high_max': 2.8087193186365766e-05, 'clip_ratio/region_mean': 4.670571286169434e-05, 'epoch': 0.04}
+
+  4%|▍         | 42/1024 [1:43:04<40:13:22, 147.46s/it][AINFO 12-02 01:56:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:56:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:56:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:56:36 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 43/1024 [1:45:01<37:41:52, 138.34s/it][A
+                                                       [A{'loss': 0.0727, 'grad_norm': 0.0021134833805263042, 'learning_rate': 1e-05, 'num_tokens': 31083672.0, 'completions/mean_length': 4875.125, 'completions/min_length': 349.0, 'completions/max_length': 14182.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4875.125, 'completions/min_terminated_length': 349.0, 'completions/max_terminated_length': 14182.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3584783971309662, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019140049815177917, 'sampling/sampling_logp_difference/max': 4.41345739364624, 'sampling/importance_sampling_ratio/min': 0.012113225646317005, 'sampling/importance_sampling_ratio/mean': 0.9999340176582336, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0464690178632736, 'clip_ratio/low_mean': 4.547183698377921e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.575278570788214e-06, 'clip_ratio/high_max': 2.68402091023745e-05, 'clip_ratio/region_mean': 5.404711600931478e-05, 'epoch': 0.04}
+
+  4%|▍         | 43/1024 [1:45:01<37:41:52, 138.34s/it][AINFO 12-02 01:58:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:58:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:58:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:58:33 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 44/1024 [1:47:16<37:25:49, 137.50s/it][A
+                                                       [A{'loss': 0.0603, 'grad_norm': 0.0037735572550445795, 'learning_rate': 1e-05, 'num_tokens': 31703654.0, 'completions/mean_length': 4691.421875, 'completions/min_length': 296.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4505.82568359375, 'completions/min_terminated_length': 296.0, 'completions/max_terminated_length': 15959.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2993389964103699, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01912039890885353, 'sampling/sampling_logp_difference/max': 3.457747459411621, 'sampling/importance_sampling_ratio/min': 0.03150063753128052, 'sampling/importance_sampling_ratio/mean': 0.9999492168426514, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0229775309562683, 'clip_ratio/low_mean': 3.981287841270387e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.969491998162994e-06, 'clip_ratio/high_max': 3.9877967992651975e-05, 'clip_ratio/region_mean': 4.9782369273998484e-05, 'epoch': 0.04}
+
+  4%|▍         | 44/1024 [1:47:16<37:25:49, 137.50s/it][AINFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 45/1024 [1:49:51<38:50:35, 142.83s/it][A
+                                                       [A{'loss': 0.022, 'grad_norm': 0.0010448681423440576, 'learning_rate': 1e-05, 'num_tokens': 32599778.0, 'completions/mean_length': 6821.96875, 'completions/min_length': 1196.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6592.48046875, 'completions/min_terminated_length': 1196.0, 'completions/max_terminated_length': 15605.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.1814819872379303, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02125459350645542, 'sampling/sampling_logp_difference/max': 5.035848140716553, 'sampling/importance_sampling_ratio/min': 0.006500681862235069, 'sampling/importance_sampling_ratio/mean': 0.999915361404419, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1132484003901482, 'clip_ratio/low_mean': 1.5137359810069029e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.860472462401958e-07, 'clip_ratio/high_max': 3.5441889849607833e-06, 'clip_ratio/region_mean': 1.6023407056309225e-05, 'epoch': 0.04}
+
+  4%|▍         | 45/1024 [1:49:51<38:50:35, 142.83s/it][AINFO 12-02 02:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:03:24 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 46/1024 [1:52:39<40:50:45, 150.35s/it][A
+                                                       [A{'loss': 0.1105, 'grad_norm': 0.0017670176457613707, 'learning_rate': 1e-05, 'num_tokens': 33492737.0, 'completions/mean_length': 6834.3671875, 'completions/min_length': 624.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6605.17626953125, 'completions/min_terminated_length': 624.0, 'completions/max_terminated_length': 15753.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.3440523147583008, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019490526989102364, 'sampling/sampling_logp_difference/max': 6.156240463256836, 'sampling/importance_sampling_ratio/min': 0.0021202093921601772, 'sampling/importance_sampling_ratio/mean': 0.9999089241027832, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9827468693256378, 'clip_ratio/low_mean': 5.731516603191267e-05, 'clip_ratio/low_min': 9.891066838463303e-06, 'clip_ratio/high_mean': 1.1632234873104608e-06, 'clip_ratio/high_max': 4.652893949241843e-06, 'clip_ratio/region_mean': 5.8478389746596804e-05, 'epoch': 0.04}
+
+  4%|▍         | 46/1024 [1:52:39<40:50:45, 150.35s/it][AINFO 12-02 02:06:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:06:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:06:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:06:12 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 47/1024 [1:55:25<42:03:29, 154.97s/it][A
+                                                       [A{'loss': -0.0008, 'grad_norm': 0.0016891945851966739, 'learning_rate': 1e-05, 'num_tokens': 34312455.0, 'completions/mean_length': 6175.296875, 'completions/min_length': 558.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5845.98388671875, 'completions/min_terminated_length': 558.0, 'completions/max_terminated_length': 14098.0, 'rewards/accuracy_reward/mean': 0.1875, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.1875, 'reward_std': 0.19673937559127808, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021749887615442276, 'sampling/sampling_logp_difference/max': 9.422743797302246, 'sampling/importance_sampling_ratio/min': 8.086384332273155e-05, 'sampling/importance_sampling_ratio/mean': 0.9999643564224243, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1584237962961197, 'clip_ratio/low_mean': 2.5672919832686603e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.503530367903295e-06, 'clip_ratio/high_max': 6.717360520269722e-06, 'clip_ratio/region_mean': 2.8176450200589898e-05, 'epoch': 0.04}
+
+  5%|▍         | 47/1024 [1:55:25<42:03:29, 154.97s/it][AINFO 12-02 02:08:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:08:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:08:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:08:58 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 48/1024 [1:58:09<42:42:41, 157.54s/it][A
+                                                       [A{'loss': 0.0636, 'grad_norm': 0.003834392176941037, 'learning_rate': 1e-05, 'num_tokens': 35102738.0, 'completions/mean_length': 6036.8359375, 'completions/min_length': 510.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5955.3623046875, 'completions/min_terminated_length': 510.0, 'completions/max_terminated_length': 16054.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.36614155769348145, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019147861748933792, 'sampling/sampling_logp_difference/max': 8.874411582946777, 'sampling/importance_sampling_ratio/min': 0.00013992394087836146, 'sampling/importance_sampling_ratio/mean': 0.9998494386672974, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9301538467407227, 'clip_ratio/low_mean': 2.1058204993096297e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.189798336388776e-06, 'clip_ratio/high_max': 2.2362002255249536e-05, 'clip_ratio/region_mean': 2.9248002192616696e-05, 'epoch': 0.04}
+
+  5%|▍         | 48/1024 [1:58:09<42:42:41, 157.54s/it][AINFO 12-02 02:11:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:11:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:11:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:11:41 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 49/1024 [2:01:03<43:59:57, 162.46s/it][A
+                                                       [A{'loss': 0.0495, 'grad_norm': 0.0029804729856550694, 'learning_rate': 1e-05, 'num_tokens': 35924886.0, 'completions/mean_length': 6262.46875, 'completions/min_length': 210.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5764.68798828125, 'completions/min_terminated_length': 210.0, 'completions/max_terminated_length': 15594.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3911295533180237, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01815103553235531, 'sampling/sampling_logp_difference/max': 9.904524803161621, 'sampling/importance_sampling_ratio/min': 0.00021375219512265176, 'sampling/importance_sampling_ratio/mean': 0.9999922513961792, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8599015846848488, 'clip_ratio/low_mean': 4.08189714562468e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8754903951266897e-06, 'clip_ratio/high_max': 1.1501961580506759e-05, 'clip_ratio/region_mean': 4.369446196506033e-05, 'epoch': 0.05}
+
+  5%|▍         | 49/1024 [2:01:03<43:59:57, 162.46s/it][AINFO 12-02 02:14:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:14:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:14:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:14:35 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 50/1024 [2:03:32<42:52:20, 158.46s/it][A
+                                                       [A{'loss': 0.0059, 'grad_norm': 0.0022071697749197483, 'learning_rate': 1e-05, 'num_tokens': 36700913.0, 'completions/mean_length': 5926.8984375, 'completions/min_length': 346.0, 'completions/max_length': 14556.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5926.8984375, 'completions/min_terminated_length': 346.0, 'completions/max_terminated_length': 14556.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.3306073546409607, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01954064890742302, 'sampling/sampling_logp_difference/max': 7.557773113250732, 'sampling/importance_sampling_ratio/min': 0.0005220364546403289, 'sampling/importance_sampling_ratio/mean': 1.000010371208191, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0042993426322937, 'clip_ratio/low_mean': 3.6588148361715866e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.026886012477917e-06, 'clip_ratio/high_max': 2.4107544049911667e-05, 'clip_ratio/region_mean': 4.261503391944643e-05, 'epoch': 0.05}
+
+  5%|▍         | 50/1024 [2:03:32<42:52:20, 158.46s/it][AINFO 12-02 02:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:05 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 51/1024 [2:06:19<43:34:05, 161.20s/it][A
+                                                       [A{'loss': 0.0508, 'grad_norm': 0.002320924773812294, 'learning_rate': 1e-05, 'num_tokens': 37604865.0, 'completions/mean_length': 6873.6875, 'completions/min_length': 505.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6645.4404296875, 'completions/min_terminated_length': 505.0, 'completions/max_terminated_length': 15217.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.3135228157043457, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019532475620508194, 'sampling/sampling_logp_difference/max': 3.6437859535217285, 'sampling/importance_sampling_ratio/min': 0.026153141632676125, 'sampling/importance_sampling_ratio/mean': 0.9999098777770996, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0255412608385086, 'clip_ratio/low_mean': 2.634599570683349e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2276566394575639e-06, 'clip_ratio/high_max': 4.9106265578302555e-06, 'clip_ratio/region_mean': 2.7573652346291055e-05, 'epoch': 0.05}
+
+  5%|▍         | 51/1024 [2:06:19<43:34:05, 161.20s/it][AINFO 12-02 02:19:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:19:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:19:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:19:52 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 52/1024 [2:08:52<42:52:24, 158.79s/it][A
+                                                       [A{'loss': 0.0668, 'grad_norm': 0.002212709980085492, 'learning_rate': 1e-05, 'num_tokens': 38405196.0, 'completions/mean_length': 6073.8984375, 'completions/min_length': 654.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5992.71630859375, 'completions/min_terminated_length': 654.0, 'completions/max_terminated_length': 15668.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021252838894724846, 'sampling/sampling_logp_difference/max': 11.651445388793945, 'sampling/importance_sampling_ratio/min': 8.706459084351081e-06, 'sampling/importance_sampling_ratio/mean': 0.9998978972434998, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0713753998279572, 'clip_ratio/low_mean': 2.351988746340794e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.087627530680038e-06, 'clip_ratio/high_max': 1.6350510122720152e-05, 'clip_ratio/region_mean': 2.7607515221461654e-05, 'epoch': 0.05}
+
+  5%|▌         | 52/1024 [2:08:52<42:52:24, 158.79s/it][AINFO 12-02 02:22:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:22:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:22:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:22:25 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 53/1024 [2:11:20<41:56:07, 155.48s/it][A
+                                                       [A{'loss': 0.0178, 'grad_norm': 0.001135052996687591, 'learning_rate': 1e-05, 'num_tokens': 39171704.0, 'completions/mean_length': 5838.71875, 'completions/min_length': 331.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5671.33349609375, 'completions/min_terminated_length': 331.0, 'completions/max_terminated_length': 15299.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.23410367965698242, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020781882107257843, 'sampling/sampling_logp_difference/max': 5.7812418937683105, 'sampling/importance_sampling_ratio/min': 0.003084881929680705, 'sampling/importance_sampling_ratio/mean': 0.9999173879623413, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.021155133843422, 'clip_ratio/low_mean': 3.729486718384578e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.729486718384578e-05, 'epoch': 0.05}
+
+  5%|▌         | 53/1024 [2:11:20<41:56:07, 155.48s/it][AINFO 12-02 02:24:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:24:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:24:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:24:53 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 54/1024 [2:13:48<41:16:54, 153.21s/it][A
+                                                       [A{'loss': 0.0145, 'grad_norm': 0.002638082252815366, 'learning_rate': 1e-05, 'num_tokens': 40003859.0, 'completions/mean_length': 6358.5859375, 'completions/min_length': 940.0, 'completions/max_length': 14617.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6358.5859375, 'completions/min_terminated_length': 940.0, 'completions/max_terminated_length': 14617.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3174618184566498, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01991666667163372, 'sampling/sampling_logp_difference/max': 3.932096481323242, 'sampling/importance_sampling_ratio/min': 0.01960253342986107, 'sampling/importance_sampling_ratio/mean': 1.0000380277633667, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9720487147569656, 'clip_ratio/low_mean': 3.706903294187214e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.281042436105054e-06, 'clip_ratio/high_max': 1.7124169744420215e-05, 'clip_ratio/region_mean': 4.135007543482061e-05, 'epoch': 0.05}
+
+  5%|▌         | 54/1024 [2:13:48<41:16:54, 153.21s/it][AINFO 12-02 02:27:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:27:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:27:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:27:21 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 55/1024 [2:16:07<40:02:47, 148.78s/it][A
+                                                       [A{'loss': 0.0221, 'grad_norm': 0.001994960242882371, 'learning_rate': 1e-05, 'num_tokens': 40616483.0, 'completions/mean_length': 4634.1875, 'completions/min_length': 266.0, 'completions/max_length': 14753.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4634.1875, 'completions/min_terminated_length': 266.0, 'completions/max_terminated_length': 14753.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.29644322395324707, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01902047172188759, 'sampling/sampling_logp_difference/max': 11.46318244934082, 'sampling/importance_sampling_ratio/min': 1.0510009815334342e-05, 'sampling/importance_sampling_ratio/mean': 1.0000698566436768, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9715309366583824, 'clip_ratio/low_mean': 2.222621503733535e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.994117721755174e-06, 'clip_ratio/high_max': 6.55582925901399e-06, 'clip_ratio/region_mean': 2.5220332759090525e-05, 'epoch': 0.05}
+
+  5%|▌         | 55/1024 [2:16:07<40:02:47, 148.78s/it][AINFO 12-02 02:29:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:29:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:29:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:29:39 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 56/1024 [2:19:08<42:36:32, 158.46s/it][A
+                                                       [A{'loss': 0.0506, 'grad_norm': 0.0019902780186384916, 'learning_rate': 1e-05, 'num_tokens': 41484443.0, 'completions/mean_length': 6596.25, 'completions/min_length': 318.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6361.34423828125, 'completions/min_terminated_length': 318.0, 'completions/max_terminated_length': 15953.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.326668381690979, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018301833420991898, 'sampling/sampling_logp_difference/max': 9.499993324279785, 'sampling/importance_sampling_ratio/min': 7.485233072657138e-05, 'sampling/importance_sampling_ratio/mean': 1.000016689300537, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8207943215966225, 'clip_ratio/low_mean': 4.3583780325207044e-05, 'clip_ratio/low_min': 4.6013396968191955e-06, 'clip_ratio/high_mean': 7.571314540655294e-06, 'clip_ratio/high_max': 2.2474248908110894e-05, 'clip_ratio/region_mean': 5.1155094070054474e-05, 'epoch': 0.05}
+
+  5%|▌         | 56/1024 [2:19:08<42:36:32, 158.46s/it][AINFO 12-02 02:32:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:32:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:32:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:32:40 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 57/1024 [2:21:46<42:34:42, 158.51s/it][A
+                                                       [A{'loss': 0.0128, 'grad_norm': 0.0015235114842653275, 'learning_rate': 1e-05, 'num_tokens': 42372235.0, 'completions/mean_length': 6785.75, 'completions/min_length': 393.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6313.70458984375, 'completions/min_terminated_length': 393.0, 'completions/max_terminated_length': 15834.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.325075626373291, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019945615902543068, 'sampling/sampling_logp_difference/max': 3.6238646507263184, 'sampling/importance_sampling_ratio/min': 0.026679370552301407, 'sampling/importance_sampling_ratio/mean': 0.9999551773071289, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9876058474183083, 'clip_ratio/low_mean': 4.332785601945943e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.504983159378753e-07, 'clip_ratio/high_max': 3.0019932637515012e-06, 'clip_ratio/region_mean': 4.407835376696312e-05, 'epoch': 0.05}
+
+  6%|▌         | 57/1024 [2:21:46<42:34:42, 158.51s/it][AINFO 12-02 02:35:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:35:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:35:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:35:19 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 02:37:03,220 - math_verify.grader - WARNING - Timeout during comparison
+
+  6%|▌         | 58/1024 [2:24:29<42:54:19, 159.90s/it][A
+                                                       [A{'loss': 0.1088, 'grad_norm': 0.002033712575212121, 'learning_rate': 1e-05, 'num_tokens': 43015238.0, 'completions/mean_length': 4881.2109375, 'completions/min_length': 437.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4510.1533203125, 'completions/min_terminated_length': 437.0, 'completions/max_terminated_length': 14726.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2869548797607422, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01887543685734272, 'sampling/sampling_logp_difference/max': 8.996081352233887, 'sampling/importance_sampling_ratio/min': 0.0001238943514181301, 'sampling/importance_sampling_ratio/mean': 1.0000300407409668, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.989942155778408, 'clip_ratio/low_mean': 2.1349006601667497e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.1349006601667497e-05, 'epoch': 0.05}
+
+  6%|▌         | 58/1024 [2:24:29<42:54:19, 159.90s/it][AINFO 12-02 02:38:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:38:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:38:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:38:02 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 59/1024 [2:26:38<40:20:43, 150.51s/it][A
+                                                       [A{'loss': 0.0691, 'grad_norm': 0.0030296226032078266, 'learning_rate': 1e-05, 'num_tokens': 43637737.0, 'completions/mean_length': 4725.3984375, 'completions/min_length': 117.0, 'completions/max_length': 15001.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4725.3984375, 'completions/min_terminated_length': 117.0, 'completions/max_terminated_length': 15001.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.32035762071609497, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01995944231748581, 'sampling/sampling_logp_difference/max': 8.380389213562012, 'sampling/importance_sampling_ratio/min': 0.00022932067804504186, 'sampling/importance_sampling_ratio/mean': 0.9999939203262329, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0350637435913086, 'clip_ratio/low_mean': 2.1371045761497953e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.4600117184454575e-06, 'clip_ratio/high_max': 2.584004687378183e-05, 'clip_ratio/region_mean': 2.7831058105221018e-05, 'epoch': 0.05}
+
+  6%|▌         | 59/1024 [2:26:38<40:20:43, 150.51s/it][AINFO 12-02 02:40:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:40:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:40:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:40:11 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 60/1024 [2:29:03<39:54:18, 149.02s/it][A
+                                                       [A{'loss': 0.0845, 'grad_norm': 0.002758471528068185, 'learning_rate': 1e-05, 'num_tokens': 44285327.0, 'completions/mean_length': 4918.171875, 'completions/min_length': 388.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4736.1748046875, 'completions/min_terminated_length': 388.0, 'completions/max_terminated_length': 16240.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019083233550190926, 'sampling/sampling_logp_difference/max': 4.513625144958496, 'sampling/importance_sampling_ratio/min': 0.010958661325275898, 'sampling/importance_sampling_ratio/mean': 0.9999663233757019, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.965274304151535, 'clip_ratio/low_mean': 3.5168303838872816e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.986834937881213e-06, 'clip_ratio/high_max': 1.994733975152485e-05, 'clip_ratio/region_mean': 4.015513832200668e-05, 'epoch': 0.06}
+
+  6%|▌         | 60/1024 [2:29:03<39:54:18, 149.02s/it][AINFO 12-02 02:42:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:42:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:42:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:42:36 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 61/1024 [2:31:29<39:36:28, 148.07s/it][A
+                                                       [A{'loss': 0.0575, 'grad_norm': 0.0027611786499619484, 'learning_rate': 1e-05, 'num_tokens': 44944356.0, 'completions/mean_length': 4998.2890625, 'completions/min_length': 524.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4908.6376953125, 'completions/min_terminated_length': 524.0, 'completions/max_terminated_length': 15031.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.3895368278026581, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017853498458862305, 'sampling/sampling_logp_difference/max': 6.284412384033203, 'sampling/importance_sampling_ratio/min': 0.0018651526188477874, 'sampling/importance_sampling_ratio/mean': 0.9999884366989136, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9200445115566254, 'clip_ratio/low_mean': 3.838553107016196e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6553909719950752e-06, 'clip_ratio/high_max': 1.0621563887980301e-05, 'clip_ratio/region_mean': 4.1040922042157035e-05, 'epoch': 0.06}
+
+  6%|▌         | 61/1024 [2:31:29<39:36:28, 148.07s/it][AINFO 12-02 02:45:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:45:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:45:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:45:02 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 62/1024 [2:34:03<40:02:05, 149.82s/it][A
+                                                       [A{'loss': 0.0111, 'grad_norm': 0.0015557854203507304, 'learning_rate': 1e-05, 'num_tokens': 45767867.0, 'completions/mean_length': 6290.1796875, 'completions/min_length': 302.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6129.96044921875, 'completions/min_terminated_length': 302.0, 'completions/max_terminated_length': 16110.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.30168038606643677, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0200855303555727, 'sampling/sampling_logp_difference/max': 6.812033176422119, 'sampling/importance_sampling_ratio/min': 0.0011004531988874078, 'sampling/importance_sampling_ratio/mean': 0.9999427795410156, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9360214695334435, 'clip_ratio/low_mean': 2.0260404085092887e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.534156124056608e-06, 'clip_ratio/high_max': 1.0136624496226432e-05, 'clip_ratio/region_mean': 2.2794560095462657e-05, 'epoch': 0.06}
+
+  6%|▌         | 62/1024 [2:34:03<40:02:05, 149.82s/it][AINFO 12-02 02:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:47:36 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 63/1024 [2:37:05<42:33:32, 159.43s/it][A
+                                                       [A{'loss': -0.0194, 'grad_norm': 0.0026953541673719883, 'learning_rate': 1e-05, 'num_tokens': 46618575.0, 'completions/mean_length': 6486.15625, 'completions/min_length': 77.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6248.6083984375, 'completions/min_terminated_length': 77.0, 'completions/max_terminated_length': 16366.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.2580180764198303, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01863238587975502, 'sampling/sampling_logp_difference/max': 6.749991416931152, 'sampling/importance_sampling_ratio/min': 0.0011708897072821856, 'sampling/importance_sampling_ratio/mean': 0.9999406337738037, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.863138921558857, 'clip_ratio/low_mean': 4.51761221711422e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.6398778269795e-07, 'clip_ratio/high_max': 2.2559511307918e-06, 'clip_ratio/region_mean': 4.574010984015331e-05, 'epoch': 0.06}
+
+  6%|▌         | 63/1024 [2:37:05<42:33:32, 159.43s/it][AINFO 12-02 02:50:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:50:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:50:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:50:38 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▋         | 64/1024 [2:39:39<42:02:09, 157.63s/it][A
+                                                       [A{'loss': 0.0569, 'grad_norm': 0.0024442619178444147, 'learning_rate': 1e-05, 'num_tokens': 47462274.0, 'completions/mean_length': 6442.7734375, 'completions/min_length': 776.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6284.9765625, 'completions/min_terminated_length': 776.0, 'completions/max_terminated_length': 16010.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2777610421180725, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019810764119029045, 'sampling/sampling_logp_difference/max': 19.124980926513672, 'sampling/importance_sampling_ratio/min': 4.9445447736218284e-09, 'sampling/importance_sampling_ratio/mean': 0.9998892545700073, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0242054909467697, 'clip_ratio/low_mean': 2.787370635815023e-05, 'clip_ratio/low_min': 3.837534222839167e-06, 'clip_ratio/high_mean': 2.518339442758588e-06, 'clip_ratio/high_max': 1.0073357771034352e-05, 'clip_ratio/region_mean': 3.0392045573535142e-05, 'epoch': 0.06}
+
+  6%|▋         | 64/1024 [2:39:39<42:02:09, 157.63s/it][AINFO 12-02 02:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:53:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▋         | 65/1024 [2:42:00<40:42:02, 152.79s/it][A
+                                                       [A{'loss': 0.003, 'grad_norm': 0.002512057079002261, 'learning_rate': 1e-05, 'num_tokens': 48096692.0, 'completions/mean_length': 4807.765625, 'completions/min_length': 272.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4716.6142578125, 'completions/min_terminated_length': 272.0, 'completions/max_terminated_length': 15624.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3435155153274536, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01960371434688568, 'sampling/sampling_logp_difference/max': 11.374892234802246, 'sampling/importance_sampling_ratio/min': 1.1480136890895665e-05, 'sampling/importance_sampling_ratio/mean': 0.9999058842658997, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.045751042664051, 'clip_ratio/low_mean': 4.339240456374682e-05, 'clip_ratio/low_min': 4.491233084991109e-06, 'clip_ratio/high_mean': 3.0520259315380827e-06, 'clip_ratio/high_max': 1.220810372615233e-05, 'clip_ratio/region_mean': 4.644443038159807e-05, 'epoch': 0.06}
+
+  6%|▋         | 65/1024 [2:42:00<40:42:02, 152.79s/it][AINFO 12-02 02:55:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:55:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:55:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:55:33 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▋         | 66/1024 [2:45:01<42:53:15, 161.16s/it][A
+                                                       [A{'loss': -0.024, 'grad_norm': 0.0016389708034694195, 'learning_rate': 1e-05, 'num_tokens': 48974399.0, 'completions/mean_length': 6703.8359375, 'completions/min_length': 813.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6471.51220703125, 'completions/min_terminated_length': 813.0, 'completions/max_terminated_length': 16264.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.2585548758506775, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020880095660686493, 'sampling/sampling_logp_difference/max': 11.8125, 'sampling/importance_sampling_ratio/min': 7.4113349910476245e-06, 'sampling/importance_sampling_ratio/mean': 0.9999353885650635, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0592866837978363, 'clip_ratio/low_mean': 3.0161771633174794e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.344853046703065e-06, 'clip_ratio/high_max': 5.37941218681226e-06, 'clip_ratio/region_mean': 3.1506624850408116e-05, 'epoch': 0.06}
+
+  6%|▋         | 66/1024 [2:45:01<42:53:15, 161.16s/it][AINFO 12-02 02:58:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:58:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:58:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:58:34 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 67/1024 [2:47:34<42:11:12, 158.70s/it][A
+                                                       [A{'loss': 0.0448, 'grad_norm': 0.003946912474930286, 'learning_rate': 1e-05, 'num_tokens': 49779920.0, 'completions/mean_length': 6140.5078125, 'completions/min_length': 462.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5724.10546875, 'completions/min_terminated_length': 462.0, 'completions/max_terminated_length': 16295.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.36796674132347107, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021355850622057915, 'sampling/sampling_logp_difference/max': 15.070974349975586, 'sampling/importance_sampling_ratio/min': 2.849436668839189e-07, 'sampling/importance_sampling_ratio/mean': 0.9999687671661377, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0998501181602478, 'clip_ratio/low_mean': 4.470584758564655e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7734001289682055e-06, 'clip_ratio/high_max': 7.093600515872822e-06, 'clip_ratio/region_mean': 4.647924811251869e-05, 'epoch': 0.06}
+
+  7%|▋         | 67/1024 [2:47:34<42:11:12, 158.70s/it][AINFO 12-02 03:01:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:01:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:01:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:01:07 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 68/1024 [2:50:46<44:51:13, 168.91s/it][A
+                                                       [A{'loss': 0.0283, 'grad_norm': 0.0021656695753335953, 'learning_rate': 1e-05, 'num_tokens': 50655023.0, 'completions/mean_length': 6689.8046875, 'completions/min_length': 422.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6213.04052734375, 'completions/min_terminated_length': 422.0, 'completions/max_terminated_length': 16352.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.21723884344100952, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01873670145869255, 'sampling/sampling_logp_difference/max': 12.772989273071289, 'sampling/importance_sampling_ratio/min': 2.836359499269747e-06, 'sampling/importance_sampling_ratio/mean': 0.999941885471344, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8561654165387154, 'clip_ratio/low_mean': 3.313956779038563e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.313956779038563e-05, 'epoch': 0.06}
+
+  7%|▋         | 68/1024 [2:50:46<44:51:13, 168.91s/it][AINFO 12-02 03:04:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:04:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:04:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:04:19 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 69/1024 [2:52:59<41:55:33, 158.05s/it][A
+                                                       [A{'loss': 0.0397, 'grad_norm': 0.004121148493140936, 'learning_rate': 1e-05, 'num_tokens': 51406536.0, 'completions/mean_length': 5707.0078125, 'completions/min_length': 625.0, 'completions/max_length': 14330.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5707.0078125, 'completions/min_terminated_length': 625.0, 'completions/max_terminated_length': 14330.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020000409334897995, 'sampling/sampling_logp_difference/max': 7.562357425689697, 'sampling/importance_sampling_ratio/min': 0.0005196487763896585, 'sampling/importance_sampling_ratio/mean': 0.9999328851699829, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1396166533231735, 'clip_ratio/low_mean': 3.896083626386826e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.242933975248889e-06, 'clip_ratio/high_max': 2.3421607693308033e-05, 'clip_ratio/region_mean': 4.620377103492501e-05, 'epoch': 0.06}
+
+  7%|▋         | 69/1024 [2:52:59<41:55:33, 158.05s/it][AINFO 12-02 03:06:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:06:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:06:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:06:32 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 70/1024 [2:55:26<41:01:24, 154.81s/it][A
+                                                       [A{'loss': 0.0627, 'grad_norm': 0.0032538517843931913, 'learning_rate': 1e-05, 'num_tokens': 52148473.0, 'completions/mean_length': 5655.6328125, 'completions/min_length': 157.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5571.1572265625, 'completions/min_terminated_length': 157.0, 'completions/max_terminated_length': 15987.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.29432642459869385, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018881790339946747, 'sampling/sampling_logp_difference/max': 6.343922138214111, 'sampling/importance_sampling_ratio/min': 0.0017573959194123745, 'sampling/importance_sampling_ratio/mean': 1.0000033378601074, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8928132206201553, 'clip_ratio/low_mean': 2.5275351731579576e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.557263309834525e-06, 'clip_ratio/high_max': 1.82290532393381e-05, 'clip_ratio/region_mean': 2.9832615496161452e-05, 'epoch': 0.06}
+
+  7%|▋         | 70/1024 [2:55:26<41:01:24, 154.81s/it][AINFO 12-02 03:08:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:59 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 71/1024 [2:58:36<43:44:49, 165.26s/it][A
+                                                       [A{'loss': 0.0191, 'grad_norm': 0.002061733277514577, 'learning_rate': 1e-05, 'num_tokens': 53113230.0, 'completions/mean_length': 7399.7890625, 'completions/min_length': 144.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7034.5771484375, 'completions/min_terminated_length': 144.0, 'completions/max_terminated_length': 16323.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2580180764198303, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018456293269991875, 'sampling/sampling_logp_difference/max': 5.243195056915283, 'sampling/importance_sampling_ratio/min': 0.005283349193632603, 'sampling/importance_sampling_ratio/mean': 0.9999673962593079, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8808257132768631, 'clip_ratio/low_mean': 3.8109637216621195e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.209005626558792e-06, 'clip_ratio/high_max': 1.2836022506235167e-05, 'clip_ratio/region_mean': 4.131864307055366e-05, 'epoch': 0.07}
+
+  7%|▋         | 71/1024 [2:58:36<43:44:49, 165.26s/it][AINFO 12-02 03:12:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:12:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:12:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:12:09 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 72/1024 [3:01:18<43:24:49, 164.17s/it][A
+                                                       [A{'loss': 0.0208, 'grad_norm': 0.0012298432411625981, 'learning_rate': 1e-05, 'num_tokens': 53864049.0, 'completions/mean_length': 5730.9609375, 'completions/min_length': 433.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5475.2880859375, 'completions/min_terminated_length': 433.0, 'completions/max_terminated_length': 16208.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01919996738433838, 'sampling/sampling_logp_difference/max': 9.937495231628418, 'sampling/importance_sampling_ratio/min': 4.832820559386164e-05, 'sampling/importance_sampling_ratio/mean': 0.9999348521232605, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9486126750707626, 'clip_ratio/low_mean': 3.610486896832299e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.739466817227367e-06, 'clip_ratio/high_max': 1.5806871488166507e-05, 'clip_ratio/region_mean': 4.084433521711617e-05, 'epoch': 0.07}
+
+  7%|▋         | 72/1024 [3:01:18<43:24:49, 164.17s/it][AINFO 12-02 03:14:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:14:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:14:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:14:51 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 73/1024 [3:03:33<41:02:51, 155.39s/it][A
+                                                       [A{'loss': 0.0609, 'grad_norm': 0.002882040338590741, 'learning_rate': 1e-05, 'num_tokens': 54473498.0, 'completions/mean_length': 4620.5703125, 'completions/min_length': 364.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4527.94482421875, 'completions/min_terminated_length': 364.0, 'completions/max_terminated_length': 13500.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.39294686913490295, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018525000661611557, 'sampling/sampling_logp_difference/max': 15.662503242492676, 'sampling/importance_sampling_ratio/min': 1.577107298089686e-07, 'sampling/importance_sampling_ratio/mean': 0.9998915195465088, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9557560831308365, 'clip_ratio/low_mean': 3.8867822581778455e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.097533749496506e-06, 'clip_ratio/high_max': 1.2390134997986024e-05, 'clip_ratio/region_mean': 4.19653564449618e-05, 'epoch': 0.07}
+
+  7%|▋         | 73/1024 [3:03:33<41:02:51, 155.39s/it][AINFO 12-02 03:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:17:05 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 74/1024 [3:06:02<40:33:40, 153.71s/it][A
+                                                       [A{'loss': 0.0305, 'grad_norm': 0.003324020653963089, 'learning_rate': 1e-05, 'num_tokens': 55141787.0, 'completions/mean_length': 5074.0703125, 'completions/min_length': 342.0, 'completions/max_length': 16314.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5074.0703125, 'completions/min_terminated_length': 342.0, 'completions/max_terminated_length': 16314.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.30115634202957153, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018072880804538727, 'sampling/sampling_logp_difference/max': 6.920147895812988, 'sampling/importance_sampling_ratio/min': 0.0009876838885247707, 'sampling/importance_sampling_ratio/mean': 0.9999203681945801, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8830869868397713, 'clip_ratio/low_mean': 3.088819471486204e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.088819471486204e-05, 'epoch': 0.07}
+
+  7%|▋         | 74/1024 [3:06:02<40:33:40, 153.71s/it][AINFO 12-02 03:19:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:35 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 03:21:05,791 - math_verify.grader - WARNING - Timeout during comparison
+
+  7%|▋         | 75/1024 [3:08:25<39:39:59, 150.47s/it][A
+                                                       [A{'loss': 0.0216, 'grad_norm': 0.0017452294705435634, 'learning_rate': 1e-05, 'num_tokens': 55954144.0, 'completions/mean_length': 6192.1015625, 'completions/min_length': 553.0, 'completions/max_length': 15251.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6192.1015625, 'completions/min_terminated_length': 553.0, 'completions/max_terminated_length': 15251.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.23250606656074524, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021221645176410675, 'sampling/sampling_logp_difference/max': 14.496349334716797, 'sampling/importance_sampling_ratio/min': 5.061922365712235e-07, 'sampling/importance_sampling_ratio/mean': 0.9999473690986633, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0888547226786613, 'clip_ratio/low_mean': 1.526649884908693e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.526649884908693e-05, 'epoch': 0.07}
+
+  7%|▋         | 75/1024 [3:08:25<39:39:59, 150.47s/it][AINFO 12-02 03:21:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:21:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:21:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:21:58 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 76/1024 [3:11:06<40:27:36, 153.65s/it][A
+                                                       [A{'loss': 0.1073, 'grad_norm': 0.0033357341308146715, 'learning_rate': 1e-05, 'num_tokens': 56765470.0, 'completions/mean_length': 6204.296875, 'completions/min_length': 294.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6124.1416015625, 'completions/min_terminated_length': 294.0, 'completions/max_terminated_length': 15740.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.37875816226005554, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01908688060939312, 'sampling/sampling_logp_difference/max': 9.994686126708984, 'sampling/importance_sampling_ratio/min': 4.564182381727733e-05, 'sampling/importance_sampling_ratio/mean': 0.99998539686203, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0423575639724731, 'clip_ratio/low_mean': 3.340929970363504e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.080836899651331e-06, 'clip_ratio/high_max': 1.6768677141953958e-05, 'clip_ratio/region_mean': 3.84901372854074e-05, 'epoch': 0.07}
+
+  7%|▋         | 76/1024 [3:11:06<40:27:36, 153.65s/it][AINFO 12-02 03:24:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:24:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:24:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:24:39 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 77/1024 [3:13:15<38:25:41, 146.08s/it][A
+                                                       [A{'loss': 0.0015, 'grad_norm': 0.0038695367984473705, 'learning_rate': 1e-05, 'num_tokens': 57432958.0, 'completions/mean_length': 5070.3125, 'completions/min_length': 629.0, 'completions/max_length': 14409.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5070.3125, 'completions/min_terminated_length': 629.0, 'completions/max_terminated_length': 14409.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01970684342086315, 'sampling/sampling_logp_difference/max': 13.376652717590332, 'sampling/importance_sampling_ratio/min': 1.5509348259001854e-06, 'sampling/importance_sampling_ratio/mean': 0.9999223947525024, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0737399458885193, 'clip_ratio/low_mean': 2.430614893000893e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.874710377109295e-07, 'clip_ratio/high_max': 3.149884150843718e-06, 'clip_ratio/region_mean': 2.509361991087644e-05, 'epoch': 0.07}
+
+  8%|▊         | 77/1024 [3:13:15<38:25:41, 146.08s/it][AINFO 12-02 03:26:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:26:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:26:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:26:48 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 78/1024 [3:15:57<39:40:54, 151.01s/it][A
+                                                       [A{'loss': 0.0166, 'grad_norm': 0.002927646040916443, 'learning_rate': 1e-05, 'num_tokens': 58187426.0, 'completions/mean_length': 5750.21875, 'completions/min_length': 76.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5495.00830078125, 'completions/min_terminated_length': 76.0, 'completions/max_terminated_length': 15799.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019483914598822594, 'sampling/sampling_logp_difference/max': 4.186156272888184, 'sampling/importance_sampling_ratio/min': 0.015204614959657192, 'sampling/importance_sampling_ratio/mean': 0.9999390840530396, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9708107560873032, 'clip_ratio/low_mean': 2.9055729555693688e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.955485110258451e-06, 'clip_ratio/high_max': 1.9821940441033803e-05, 'clip_ratio/region_mean': 3.401121466595214e-05, 'epoch': 0.07}
+
+  8%|▊         | 78/1024 [3:15:57<39:40:54, 151.01s/it][AINFO 12-02 03:29:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:29:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:29:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:29:30 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 79/1024 [3:18:32<39:55:31, 152.10s/it][A
+                                                       [A{'loss': -0.0055, 'grad_norm': 0.0026169484481215477, 'learning_rate': 1e-05, 'num_tokens': 58983336.0, 'completions/mean_length': 6067.484375, 'completions/min_length': 656.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5986.251953125, 'completions/min_terminated_length': 656.0, 'completions/max_terminated_length': 15938.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3406373858451843, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019007554277777672, 'sampling/sampling_logp_difference/max': 13.135087013244629, 'sampling/importance_sampling_ratio/min': 1.974713995878119e-06, 'sampling/importance_sampling_ratio/mean': 0.9999620914459229, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9576351121068001, 'clip_ratio/low_mean': 4.989707144886779e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.953909180789196e-06, 'clip_ratio/high_max': 2.3815636723156786e-05, 'clip_ratio/region_mean': 5.585097960647545e-05, 'epoch': 0.07}
+
+  8%|▊         | 79/1024 [3:18:32<39:55:31, 152.10s/it][AINFO 12-02 03:32:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:32:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:32:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:32:05 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 80/1024 [3:20:31<37:18:12, 142.26s/it][A
+                                                       [A{'loss': -0.018, 'grad_norm': 0.0015429699560627341, 'learning_rate': 1e-05, 'num_tokens': 59590763.0, 'completions/mean_length': 4612.8984375, 'completions/min_length': 198.0, 'completions/max_length': 13640.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4612.8984375, 'completions/min_terminated_length': 198.0, 'completions/max_terminated_length': 13640.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.34139877557754517, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019313856959342957, 'sampling/sampling_logp_difference/max': 17.468652725219727, 'sampling/importance_sampling_ratio/min': 2.5909587364481013e-08, 'sampling/importance_sampling_ratio/mean': 0.9999473094940186, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9636320173740387, 'clip_ratio/low_mean': 4.5700241571466904e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.786730066072778e-06, 'clip_ratio/high_max': 2.4238934656750644e-05, 'clip_ratio/region_mean': 5.3486972547034384e-05, 'epoch': 0.07}
+
+  8%|▊         | 80/1024 [3:20:31<37:18:12, 142.26s/it][AINFO 12-02 03:34:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:34:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:34:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:34:04 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 81/1024 [3:23:07<38:17:35, 146.19s/it][A
+                                                       [A{'loss': 0.0813, 'grad_norm': 0.0022505265660583973, 'learning_rate': 1e-05, 'num_tokens': 60391283.0, 'completions/mean_length': 6101.3125, 'completions/min_length': 179.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5854.5283203125, 'completions/min_terminated_length': 179.0, 'completions/max_terminated_length': 16300.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.29302334785461426, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018377842381596565, 'sampling/sampling_logp_difference/max': 7.871047496795654, 'sampling/importance_sampling_ratio/min': 0.0003816343960352242, 'sampling/importance_sampling_ratio/mean': 1.0000061988830566, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8831139355897903, 'clip_ratio/low_mean': 3.0911465842109465e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.0911465842109465e-05, 'epoch': 0.07}
+
+  8%|▊         | 81/1024 [3:23:07<38:17:35, 146.19s/it][AINFO 12-02 03:36:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:36:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:36:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:36:39 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 82/1024 [3:25:26<37:42:19, 144.10s/it][A
+                                                       [A{'loss': -0.0015, 'grad_norm': 0.002069958718493581, 'learning_rate': 1e-05, 'num_tokens': 61021490.0, 'completions/mean_length': 4705.9921875, 'completions/min_length': 86.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4614.03955078125, 'completions/min_terminated_length': 86.0, 'completions/max_terminated_length': 14862.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2637920379638672, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018629569560289383, 'sampling/sampling_logp_difference/max': 10.495259284973145, 'sampling/importance_sampling_ratio/min': 2.76673017651774e-05, 'sampling/importance_sampling_ratio/mean': 0.9999030232429504, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9557913094758987, 'clip_ratio/low_mean': 2.478705800967873e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.869016609314713e-06, 'clip_ratio/high_max': 1.547606643725885e-05, 'clip_ratio/region_mean': 2.8656074391619768e-05, 'epoch': 0.08}
+
+  8%|▊         | 82/1024 [3:25:26<37:42:19, 144.10s/it][AINFO 12-02 03:38:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:38:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:38:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:38:59 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 83/1024 [3:27:39<36:47:00, 140.72s/it][A
+                                                       [A{'loss': -0.0073, 'grad_norm': 0.0034461067989468575, 'learning_rate': 1e-05, 'num_tokens': 61695382.0, 'completions/mean_length': 5116.78125, 'completions/min_length': 12.0, 'completions/max_length': 13745.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5116.78125, 'completions/min_terminated_length': 12.0, 'completions/max_terminated_length': 13745.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.30774885416030884, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019400250166654587, 'sampling/sampling_logp_difference/max': 4.4040913581848145, 'sampling/importance_sampling_ratio/min': 0.012227212078869343, 'sampling/importance_sampling_ratio/mean': 0.9999936819076538, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0198405236005783, 'clip_ratio/low_mean': 1.952954164607945e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.2276211590651656e-06, 'clip_ratio/high_max': 2.0910484636260662e-05, 'clip_ratio/region_mean': 2.4757162805144617e-05, 'epoch': 0.08}
+
+  8%|▊         | 83/1024 [3:27:39<36:47:00, 140.72s/it][AINFO 12-02 03:41:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:41:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:41:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:41:11 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 84/1024 [3:30:21<38:27:04, 147.26s/it][A
+                                                       [A{'loss': 0.1013, 'grad_norm': 0.0025854657869786024, 'learning_rate': 1e-05, 'num_tokens': 62474883.0, 'completions/mean_length': 5891.9140625, 'completions/min_length': 79.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5553.45947265625, 'completions/min_terminated_length': 79.0, 'completions/max_terminated_length': 15953.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019574139267206192, 'sampling/sampling_logp_difference/max': 6.497470378875732, 'sampling/importance_sampling_ratio/min': 0.0015072470996528864, 'sampling/importance_sampling_ratio/mean': 1.0001013278961182, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9568078517913818, 'clip_ratio/low_mean': 3.150914017169271e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.835057100332051e-06, 'clip_ratio/high_max': 1.5340228401328204e-05, 'clip_ratio/region_mean': 3.534419727202476e-05, 'epoch': 0.08}
+
+  8%|▊         | 84/1024 [3:30:21<38:27:04, 147.26s/it][AINFO 12-02 03:43:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:43:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:43:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:43:54 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 85/1024 [3:32:43<37:58:07, 145.57s/it][A
+                                                       [A{'loss': 0.0312, 'grad_norm': 0.0025075653102248907, 'learning_rate': 1e-05, 'num_tokens': 63172454.0, 'completions/mean_length': 5300.3359375, 'completions/min_length': 17.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5213.06298828125, 'completions/min_terminated_length': 17.0, 'completions/max_terminated_length': 13671.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02002432942390442, 'sampling/sampling_logp_difference/max': 8.499993324279785, 'sampling/importance_sampling_ratio/min': 0.00020346972451079637, 'sampling/importance_sampling_ratio/mean': 0.9999643564224243, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9722280204296112, 'clip_ratio/low_mean': 2.2325777763398946e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7707585559255676e-06, 'clip_ratio/high_max': 1.108303422370227e-05, 'clip_ratio/region_mean': 2.5096536319324514e-05, 'epoch': 0.08}
+
+  8%|▊         | 85/1024 [3:32:43<37:58:07, 145.57s/it][AINFO 12-02 03:46:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:46:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:46:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:46:16 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 86/1024 [3:35:30<39:36:28, 152.01s/it][A
+                                                       [A{'loss': -0.0117, 'grad_norm': 0.0017982006538659334, 'learning_rate': 1e-05, 'num_tokens': 64007602.0, 'completions/mean_length': 6364.21875, 'completions/min_length': 215.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6205.1748046875, 'completions/min_terminated_length': 215.0, 'completions/max_terminated_length': 15617.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.21778053045272827, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020373597741127014, 'sampling/sampling_logp_difference/max': 10.171680450439453, 'sampling/importance_sampling_ratio/min': 3.823801307589747e-05, 'sampling/importance_sampling_ratio/mean': 1.0000064373016357, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0607495978474617, 'clip_ratio/low_mean': 4.893367201930232e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.4979869951712317e-06, 'clip_ratio/high_max': 1.3991947980684927e-05, 'clip_ratio/region_mean': 5.243165958290774e-05, 'epoch': 0.08}
+
+  8%|▊         | 86/1024 [3:35:30<39:36:28, 152.01s/it][AINFO 12-02 03:49:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:49:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:49:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:49:03 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 87/1024 [3:38:00<39:27:11, 151.58s/it][A
+                                                       [A{'loss': 0.063, 'grad_norm': 0.002207317156717181, 'learning_rate': 1e-05, 'num_tokens': 64762058.0, 'completions/mean_length': 5746.3125, 'completions/min_length': 287.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5403.1611328125, 'completions/min_terminated_length': 287.0, 'completions/max_terminated_length': 14709.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.3264310359954834, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020608089864253998, 'sampling/sampling_logp_difference/max': 16.744617462158203, 'sampling/importance_sampling_ratio/min': 5.3444750847120304e-08, 'sampling/importance_sampling_ratio/mean': 0.9999239444732666, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9913106113672256, 'clip_ratio/low_mean': 2.6416430046083406e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.6416430046083406e-05, 'epoch': 0.08}
+
+  8%|▊         | 87/1024 [3:38:00<39:27:11, 151.58s/it][AINFO 12-02 03:51:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:51:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:51:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:51:33 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▊         | 88/1024 [3:40:39<39:56:18, 153.61s/it][A
+                                                       [A{'loss': 0.0401, 'grad_norm': 0.0012974507408216596, 'learning_rate': 1e-05, 'num_tokens': 65561002.0, 'completions/mean_length': 6088.5625, 'completions/min_length': 159.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5841.47216796875, 'completions/min_terminated_length': 159.0, 'completions/max_terminated_length': 16292.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2477683573961258, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01939838007092476, 'sampling/sampling_logp_difference/max': 12.020174026489258, 'sampling/importance_sampling_ratio/min': 6.021501121722395e-06, 'sampling/importance_sampling_ratio/mean': 0.9998487234115601, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9040444120764732, 'clip_ratio/low_mean': 3.541917828897567e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1704154253020533e-06, 'clip_ratio/high_max': 1.2681661701208213e-05, 'clip_ratio/region_mean': 3.85895939416514e-05, 'epoch': 0.08}
+
+  9%|▊         | 88/1024 [3:40:39<39:56:18, 153.61s/it][AINFO 12-02 03:54:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:54:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:54:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:54:12 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▊         | 89/1024 [3:43:15<40:07:45, 154.51s/it][A
+                                                       [A{'loss': 0.0522, 'grad_norm': 0.0013380619930103421, 'learning_rate': 1e-05, 'num_tokens': 66318482.0, 'completions/mean_length': 5765.5, 'completions/min_length': 412.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5510.65625, 'completions/min_terminated_length': 412.0, 'completions/max_terminated_length': 15021.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.13994136452674866, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.018109245225787163, 'sampling/sampling_logp_difference/max': 11.829224586486816, 'sampling/importance_sampling_ratio/min': 7.288413598871557e-06, 'sampling/importance_sampling_ratio/mean': 0.9999471306800842, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9966336265206337, 'clip_ratio/low_mean': 1.8564539345788944e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9517831333359936e-06, 'clip_ratio/high_max': 7.807132533343975e-06, 'clip_ratio/region_mean': 2.05163223654381e-05, 'epoch': 0.08}
+
+  9%|▊         | 89/1024 [3:43:15<40:07:45, 154.51s/it][AINFO 12-02 03:56:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:56:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:56:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:56:48 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 90/1024 [3:45:44<39:40:00, 152.89s/it][A
+                                                       [A{'loss': 0.0475, 'grad_norm': 0.0023930128663778305, 'learning_rate': 1e-05, 'num_tokens': 67038582.0, 'completions/mean_length': 5462.78125, 'completions/min_length': 460.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5200.67236328125, 'completions/min_terminated_length': 460.0, 'completions/max_terminated_length': 16120.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019220296293497086, 'sampling/sampling_logp_difference/max': 4.7666497230529785, 'sampling/importance_sampling_ratio/min': 0.008508839644491673, 'sampling/importance_sampling_ratio/mean': 0.9999513030052185, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9345141425728798, 'clip_ratio/low_mean': 2.5812531305291486e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.476728122426721e-06, 'clip_ratio/high_max': 1.7906912489706883e-05, 'clip_ratio/region_mean': 3.0289259655091882e-05, 'epoch': 0.08}
+
+  9%|▉         | 90/1024 [3:45:44<39:40:00, 152.89s/it][AINFO 12-02 03:59:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:59:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:59:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:59:17 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 91/1024 [3:48:00<38:16:49, 147.71s/it][A
+                                                       [A{'loss': 0.0462, 'grad_norm': 0.0013633714988827705, 'learning_rate': 1e-05, 'num_tokens': 67774487.0, 'completions/mean_length': 5547.5078125, 'completions/min_length': 14.0, 'completions/max_length': 15034.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5547.5078125, 'completions/min_terminated_length': 14.0, 'completions/max_terminated_length': 15034.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.19438527524471283, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020328814163804054, 'sampling/sampling_logp_difference/max': 11.418023109436035, 'sampling/importance_sampling_ratio/min': 1.0995515367540065e-05, 'sampling/importance_sampling_ratio/mean': 0.9999545216560364, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0511749312281609, 'clip_ratio/low_mean': 3.239646628117043e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.878472796259302e-06, 'clip_ratio/high_max': 1.551389118503721e-05, 'clip_ratio/region_mean': 3.6274939645863924e-05, 'epoch': 0.08}
+
+  9%|▉         | 91/1024 [3:48:00<38:16:49, 147.71s/it][AINFO 12-02 04:01:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:01:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:01:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:01:33 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 92/1024 [3:50:40<39:09:26, 151.25s/it][A
+                                                       [A{'loss': 0.0373, 'grad_norm': 0.0036165034398436546, 'learning_rate': 1e-05, 'num_tokens': 68541660.0, 'completions/mean_length': 5835.4140625, 'completions/min_length': 384.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5406.609375, 'completions/min_terminated_length': 384.0, 'completions/max_terminated_length': 14029.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.3584783673286438, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020469525828957558, 'sampling/sampling_logp_difference/max': 11.562312126159668, 'sampling/importance_sampling_ratio/min': 9.518130354990717e-06, 'sampling/importance_sampling_ratio/mean': 0.9999669790267944, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0024723336100578, 'clip_ratio/low_mean': 3.441604167164769e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.846247352612409e-06, 'clip_ratio/high_max': 1.5384989410449634e-05, 'clip_ratio/region_mean': 3.826228908110352e-05, 'epoch': 0.08}
+
+  9%|▉         | 92/1024 [3:50:40<39:09:26, 151.25s/it][AINFO 12-02 04:04:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:04:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:04:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:04:12 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 93/1024 [3:53:23<40:01:40, 154.78s/it][A
+                                                       [A{'loss': 0.036, 'grad_norm': 0.0022011541295796633, 'learning_rate': 1e-05, 'num_tokens': 69365418.0, 'completions/mean_length': 6252.609375, 'completions/min_length': 481.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6172.83447265625, 'completions/min_terminated_length': 481.0, 'completions/max_terminated_length': 15695.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.32301604747772217, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02079072594642639, 'sampling/sampling_logp_difference/max': 7.4999566078186035, 'sampling/importance_sampling_ratio/min': 0.0005531083443202078, 'sampling/importance_sampling_ratio/mean': 0.9998809099197388, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0325519517064095, 'clip_ratio/low_mean': 5.3129634352444555e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.526400637885672e-06, 'clip_ratio/high_max': 6.105602551542688e-06, 'clip_ratio/region_mean': 5.46560352177039e-05, 'epoch': 0.09}
+
+  9%|▉         | 93/1024 [3:53:23<40:01:40, 154.78s/it][AINFO 12-02 04:06:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:06:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:06:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:06:55 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 94/1024 [3:55:45<39:01:17, 151.05s/it][A
+                                                       [A{'loss': -0.0007, 'grad_norm': 0.002300912281498313, 'learning_rate': 1e-05, 'num_tokens': 70099320.0, 'completions/mean_length': 5581.484375, 'completions/min_length': 461.0, 'completions/max_length': 15316.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5581.484375, 'completions/min_terminated_length': 461.0, 'completions/max_terminated_length': 15316.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.2959064245223999, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01952272653579712, 'sampling/sampling_logp_difference/max': 16.323843002319336, 'sampling/importance_sampling_ratio/min': 8.140386853483506e-08, 'sampling/importance_sampling_ratio/mean': 0.9998577833175659, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9222500994801521, 'clip_ratio/low_mean': 3.0097819148977578e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0870321602851618e-06, 'clip_ratio/high_max': 4.348128641140647e-06, 'clip_ratio/region_mean': 3.118485085451539e-05, 'epoch': 0.09}
+
+  9%|▉         | 94/1024 [3:55:45<39:01:17, 151.05s/it][AINFO 12-02 04:09:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:09:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:09:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:09:18 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 95/1024 [3:58:09<38:26:07, 148.94s/it][A
+                                                       [A{'loss': 0.0119, 'grad_norm': 0.0016312639927491546, 'learning_rate': 1e-05, 'num_tokens': 70811474.0, 'completions/mean_length': 5424.140625, 'completions/min_length': 124.0, 'completions/max_length': 15781.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5424.140625, 'completions/min_terminated_length': 124.0, 'completions/max_terminated_length': 15781.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.31246691942214966, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019741754978895187, 'sampling/sampling_logp_difference/max': 6.12296199798584, 'sampling/importance_sampling_ratio/min': 0.0021919538266956806, 'sampling/importance_sampling_ratio/mean': 1.0000094175338745, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0446564108133316, 'clip_ratio/low_mean': 3.5122252029395895e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.5122252029395895e-05, 'epoch': 0.09}
+
+  9%|▉         | 95/1024 [3:58:09<38:26:07, 148.94s/it][AINFO 12-02 04:11:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:11:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:11:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:11:42 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 96/1024 [4:00:31<37:51:54, 146.89s/it][A
+                                                       [A{'loss': 0.0451, 'grad_norm': 0.0032865386456251144, 'learning_rate': 1e-05, 'num_tokens': 71582701.0, 'completions/mean_length': 5884.9609375, 'completions/min_length': 382.0, 'completions/max_length': 15755.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5884.9609375, 'completions/min_terminated_length': 382.0, 'completions/max_terminated_length': 15755.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3514111638069153, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019438734278082848, 'sampling/sampling_logp_difference/max': 11.373762130737305, 'sampling/importance_sampling_ratio/min': 1.149311810877407e-05, 'sampling/importance_sampling_ratio/mean': 0.9999833106994629, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9605691060423851, 'clip_ratio/low_mean': 4.096481598026003e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.496124691082514e-06, 'clip_ratio/high_max': 1.0354576261306647e-05, 'clip_ratio/region_mean': 4.446094089871622e-05, 'epoch': 0.09}
+
+  9%|▉         | 96/1024 [4:00:31<37:51:54, 146.89s/it][AINFO 12-02 04:14:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:14:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:14:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:14:04 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 97/1024 [4:02:51<37:15:26, 144.69s/it][A
+                                                       [A{'loss': 0.1143, 'grad_norm': 0.004721678793430328, 'learning_rate': 1e-05, 'num_tokens': 72220025.0, 'completions/mean_length': 4835.09375, 'completions/min_length': 826.0, 'completions/max_length': 15361.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4835.09375, 'completions/min_terminated_length': 826.0, 'completions/max_terminated_length': 15361.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.38481879234313965, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017888439819216728, 'sampling/sampling_logp_difference/max': 15.12112808227539, 'sampling/importance_sampling_ratio/min': 2.710051205667696e-07, 'sampling/importance_sampling_ratio/mean': 0.99994957447052, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9038172215223312, 'clip_ratio/low_mean': 3.5440503552308655e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5674950165921473e-06, 'clip_ratio/high_max': 1.026998006636859e-05, 'clip_ratio/region_mean': 3.8007998455213965e-05, 'epoch': 0.09}
+
+  9%|▉         | 97/1024 [4:02:51<37:15:26, 144.69s/it][AINFO 12-02 04:16:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:16:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:16:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:16:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 98/1024 [4:05:14<37:08:58, 144.43s/it][A
+                                                       [A{'loss': 0.0187, 'grad_norm': 0.0010532280430197716, 'learning_rate': 1e-05, 'num_tokens': 73005515.0, 'completions/mean_length': 5979.078125, 'completions/min_length': 241.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5897.1494140625, 'completions/min_terminated_length': 241.0, 'completions/max_terminated_length': 14431.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.30115631222724915, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019633149728178978, 'sampling/sampling_logp_difference/max': 8.10648250579834, 'sampling/importance_sampling_ratio/min': 0.00030157779110595584, 'sampling/importance_sampling_ratio/mean': 0.9999090433120728, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0227951630949974, 'clip_ratio/low_mean': 4.7865792453194445e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.56252398509605e-06, 'clip_ratio/high_max': 2.93432283342554e-05, 'clip_ratio/region_mean': 5.742831808674964e-05, 'epoch': 0.09}
+
+ 10%|▉         | 98/1024 [4:05:14<37:08:58, 144.43s/it][AINFO 12-02 04:18:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:18:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:18:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:18:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 99/1024 [4:07:26<36:07:16, 140.58s/it][A
+                                                       [A{'loss': 0.0669, 'grad_norm': 0.0038324075285345316, 'learning_rate': 1e-05, 'num_tokens': 73572794.0, 'completions/mean_length': 4292.1796875, 'completions/min_length': 159.0, 'completions/max_length': 15886.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4292.1796875, 'completions/min_terminated_length': 159.0, 'completions/max_terminated_length': 15886.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2972046136856079, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018074234947562218, 'sampling/sampling_logp_difference/max': 4.155638694763184, 'sampling/importance_sampling_ratio/min': 0.015675775706768036, 'sampling/importance_sampling_ratio/mean': 0.9999188780784607, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8719984591007233, 'clip_ratio/low_mean': 2.5574990331733716e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.050808691616112e-06, 'clip_ratio/high_max': 4.203234766464448e-06, 'clip_ratio/region_mean': 2.6625799137036665e-05, 'epoch': 0.09}
+
+ 10%|▉         | 99/1024 [4:07:26<36:07:16, 140.58s/it][AINFO 12-02 04:20:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:20:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:20:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:20:59 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 04:22:25,001 - math_verify.grader - WARNING - Timeout during comparison
+
+ 10%|▉         | 100/1024 [4:09:45<35:55:40, 139.98s/it][A
+                                                        [A{'loss': 0.0166, 'grad_norm': 0.004843447357416153, 'learning_rate': 1e-05, 'num_tokens': 74289607.0, 'completions/mean_length': 5449.2890625, 'completions/min_length': 635.0, 'completions/max_length': 14674.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5449.2890625, 'completions/min_terminated_length': 635.0, 'completions/max_terminated_length': 14674.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.40609243512153625, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018183842301368713, 'sampling/sampling_logp_difference/max': 13.937499046325684, 'sampling/importance_sampling_ratio/min': 8.851584993863071e-07, 'sampling/importance_sampling_ratio/mean': 0.999977707862854, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9137986451387405, 'clip_ratio/low_mean': 4.433405501913512e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1078417401222396e-06, 'clip_ratio/high_max': 4.431366960488958e-06, 'clip_ratio/region_mean': 4.54418968729442e-05, 'epoch': 0.09}
+
+ 10%|▉         | 100/1024 [4:09:45<35:55:40, 139.98s/it][AINFO 12-02 04:23:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:23:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:23:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:23:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 101/1024 [4:12:07<36:04:11, 140.68s/it][A
+                                                        [A{'loss': 0.0463, 'grad_norm': 0.0037651765160262585, 'learning_rate': 1e-05, 'num_tokens': 74946484.0, 'completions/mean_length': 4983.3515625, 'completions/min_length': 541.0, 'completions/max_length': 16163.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4983.3515625, 'completions/min_terminated_length': 541.0, 'completions/max_terminated_length': 16163.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3090519309043884, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018207306042313576, 'sampling/sampling_logp_difference/max': 9.062478065490723, 'sampling/importance_sampling_ratio/min': 0.00011593531962716952, 'sampling/importance_sampling_ratio/mean': 0.9999549984931946, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9354705810546875, 'clip_ratio/low_mean': 3.6279372466196946e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0530192159640137e-06, 'clip_ratio/high_max': 8.212076863856055e-06, 'clip_ratio/region_mean': 3.833239122741361e-05, 'epoch': 0.09}
+
+ 10%|▉         | 101/1024 [4:12:07<36:04:11, 140.68s/it][AINFO 12-02 04:25:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:25:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:25:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:25:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 102/1024 [4:14:55<38:07:31, 148.86s/it][A
+                                                        [A{'loss': -0.023, 'grad_norm': 0.0009160125628113747, 'learning_rate': 1e-05, 'num_tokens': 75779145.0, 'completions/mean_length': 6351.1015625, 'completions/min_length': 804.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6027.45947265625, 'completions/min_terminated_length': 804.0, 'completions/max_terminated_length': 16039.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.24329257011413574, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018513178452849388, 'sampling/sampling_logp_difference/max': 8.1246919631958, 'sampling/importance_sampling_ratio/min': 0.0002961359277833253, 'sampling/importance_sampling_ratio/mean': 0.9998877048492432, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9310042560100555, 'clip_ratio/low_mean': 2.544108633628639e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.295722081020358e-06, 'clip_ratio/high_max': 1.3182888324081432e-05, 'clip_ratio/region_mean': 2.8736808644680423e-05, 'epoch': 0.09}
+
+ 10%|▉         | 102/1024 [4:14:55<38:07:31, 148.86s/it][AINFO 12-02 04:28:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:28:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:28:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:28:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 103/1024 [4:17:20<37:46:00, 147.62s/it][A
+                                                        [A{'loss': 0.1021, 'grad_norm': 0.0023993055801838636, 'learning_rate': 1e-05, 'num_tokens': 76475557.0, 'completions/mean_length': 5293.40625, 'completions/min_length': 222.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4935.64501953125, 'completions/min_terminated_length': 222.0, 'completions/max_terminated_length': 14471.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020792219787836075, 'sampling/sampling_logp_difference/max': 9.623851776123047, 'sampling/importance_sampling_ratio/min': 6.613240111619234e-05, 'sampling/importance_sampling_ratio/mean': 1.000077724456787, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0732879787683487, 'clip_ratio/low_mean': 3.059757568735222e-05, 'clip_ratio/low_min': 4.3258582991256844e-06, 'clip_ratio/high_mean': 3.935649147024378e-06, 'clip_ratio/high_max': 1.1402620202716207e-05, 'clip_ratio/region_mean': 3.45332257438713e-05, 'epoch': 0.09}
+
+ 10%|█         | 103/1024 [4:17:20<37:46:00, 147.62s/it][AINFO 12-02 04:30:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:30:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:30:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:30:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 104/1024 [4:20:13<39:44:19, 155.50s/it][A
+                                                        [A{'loss': 0.0439, 'grad_norm': 0.0014872358879074454, 'learning_rate': 1e-05, 'num_tokens': 77474310.0, 'completions/mean_length': 7619.7578125, 'completions/min_length': 25.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7409.41650390625, 'completions/min_terminated_length': 25.0, 'completions/max_terminated_length': 16305.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.33114904165267944, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020074717700481415, 'sampling/sampling_logp_difference/max': 6.395720481872559, 'sampling/importance_sampling_ratio/min': 0.0016686831368133426, 'sampling/importance_sampling_ratio/mean': 0.9999638795852661, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9646238535642624, 'clip_ratio/low_mean': 2.663600798769039e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.929533635182452e-06, 'clip_ratio/high_max': 2.130644793396641e-05, 'clip_ratio/region_mean': 3.556554071337814e-05, 'epoch': 0.1}
+
+ 10%|█         | 104/1024 [4:20:13<39:44:19, 155.50s/it][AINFO 12-02 04:33:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:33:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:33:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:33:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 105/1024 [4:22:46<39:26:29, 154.50s/it][A
+                                                        [A{'loss': 0.1016, 'grad_norm': 0.00398358516395092, 'learning_rate': 1e-05, 'num_tokens': 78257132.0, 'completions/mean_length': 5959.921875, 'completions/min_length': 55.0, 'completions/max_length': 15984.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5959.921875, 'completions/min_terminated_length': 55.0, 'completions/max_terminated_length': 15984.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3653082847595215, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020755283534526825, 'sampling/sampling_logp_difference/max': 5.806637287139893, 'sampling/importance_sampling_ratio/min': 0.0030075267422944307, 'sampling/importance_sampling_ratio/mean': 1.0000170469284058, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.004471093416214, 'clip_ratio/low_mean': 5.166909659237717e-05, 'clip_ratio/low_min': 8.365680514543783e-06, 'clip_ratio/high_mean': 5.154013138053415e-06, 'clip_ratio/high_max': 1.7765815300663235e-05, 'clip_ratio/region_mean': 5.68231100714911e-05, 'epoch': 0.1}
+
+ 10%|█         | 105/1024 [4:22:46<39:26:29, 154.50s/it][AINFO 12-02 04:36:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:36:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:36:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:36:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 106/1024 [4:25:11<38:43:41, 151.88s/it][A
+                                                        [A{'loss': 0.01, 'grad_norm': 0.0019883522763848305, 'learning_rate': 1e-05, 'num_tokens': 78971072.0, 'completions/mean_length': 5427.03125, 'completions/min_length': 556.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5340.755859375, 'completions/min_terminated_length': 556.0, 'completions/max_terminated_length': 13736.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.31694266200065613, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018812140449881554, 'sampling/sampling_logp_difference/max': 7.125164031982422, 'sampling/importance_sampling_ratio/min': 0.0008046010043472052, 'sampling/importance_sampling_ratio/mean': 1.0000550746917725, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9117375314235687, 'clip_ratio/low_mean': 4.510891039899434e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.236738959662034e-06, 'clip_ratio/high_max': 1.6946955838648137e-05, 'clip_ratio/region_mean': 4.934564867653535e-05, 'epoch': 0.1}
+
+ 10%|█         | 106/1024 [4:25:11<38:43:41, 151.88s/it][AINFO 12-02 04:38:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:38:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:38:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:38:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 107/1024 [4:27:47<38:56:47, 152.90s/it][A
+                                                        [A{'loss': 0.0426, 'grad_norm': 0.00450351694598794, 'learning_rate': 1e-05, 'num_tokens': 79764434.0, 'completions/mean_length': 6062.078125, 'completions/min_length': 108.0, 'completions/max_length': 16293.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6062.078125, 'completions/min_terminated_length': 108.0, 'completions/max_terminated_length': 16293.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.26355957984924316, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020526543259620667, 'sampling/sampling_logp_difference/max': 7.207343101501465, 'sampling/importance_sampling_ratio/min': 0.0007411236292682588, 'sampling/importance_sampling_ratio/mean': 0.9999713897705078, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0164100378751755, 'clip_ratio/low_mean': 3.220799408154562e-05, 'clip_ratio/low_min': 5.315981979947537e-06, 'clip_ratio/high_mean': 7.42044210255699e-06, 'clip_ratio/high_max': 2.968176841022796e-05, 'clip_ratio/region_mean': 3.962843629778945e-05, 'epoch': 0.1}
+
+ 10%|█         | 107/1024 [4:27:47<38:56:47, 152.90s/it][AINFO 12-02 04:41:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:41:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:41:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:41:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 108/1024 [4:29:47<36:23:59, 143.06s/it][A
+                                                        [A{'loss': 0.046, 'grad_norm': 0.0033157530706375837, 'learning_rate': 1e-05, 'num_tokens': 80405238.0, 'completions/mean_length': 4856.53125, 'completions/min_length': 191.0, 'completions/max_length': 13689.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4856.53125, 'completions/min_terminated_length': 191.0, 'completions/max_terminated_length': 13689.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.3487703502178192, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019188418984413147, 'sampling/sampling_logp_difference/max': 3.7256407737731934, 'sampling/importance_sampling_ratio/min': 0.033773623406887054, 'sampling/importance_sampling_ratio/mean': 0.9999889135360718, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0780886858701706, 'clip_ratio/low_mean': 4.856050622947805e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.856050622947805e-05, 'epoch': 0.1}
+
+ 11%|█         | 108/1024 [4:29:47<36:23:59, 143.06s/it][AINFO 12-02 04:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:43:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 109/1024 [4:32:00<35:35:58, 140.06s/it][A
+                                                        [A{'loss': -0.0779, 'grad_norm': 0.005471619311720133, 'learning_rate': 1e-05, 'num_tokens': 80926721.0, 'completions/mean_length': 3930.5859375, 'completions/min_length': 4.0, 'completions/max_length': 16335.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3930.5859375, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16335.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.3164186179637909, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017708823084831238, 'sampling/sampling_logp_difference/max': 8.269469261169434, 'sampling/importance_sampling_ratio/min': 0.0002562212466727942, 'sampling/importance_sampling_ratio/mean': 1.0000040531158447, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8666863515973091, 'clip_ratio/low_mean': 1.975351790406421e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.975351790406421e-05, 'epoch': 0.1}
+
+ 11%|█         | 109/1024 [4:32:00<35:35:58, 140.06s/it][AINFO 12-02 04:45:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:45:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:45:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:45:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 110/1024 [4:34:41<37:11:14, 146.47s/it][A
+                                                        [A{'loss': -0.0065, 'grad_norm': 0.002580739092081785, 'learning_rate': 1e-05, 'num_tokens': 81707978.0, 'completions/mean_length': 5934.9453125, 'completions/min_length': 229.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5684.16845703125, 'completions/min_terminated_length': 229.0, 'completions/max_terminated_length': 15546.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.24671243131160736, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019801246002316475, 'sampling/sampling_logp_difference/max': 5.999995708465576, 'sampling/importance_sampling_ratio/min': 0.002478762762621045, 'sampling/importance_sampling_ratio/mean': 1.0000852346420288, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9991667941212654, 'clip_ratio/low_mean': 3.61007656692891e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6859994502738118e-06, 'clip_ratio/high_max': 6.743997801095247e-06, 'clip_ratio/region_mean': 3.7786765119562915e-05, 'epoch': 0.1}
+
+ 11%|█         | 110/1024 [4:34:41<37:11:14, 146.47s/it][AINFO 12-02 04:48:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:48:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:48:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:48:14 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 04:49:51,388 - math_verify.grader - WARNING - Timeout during comparison
+
+ 11%|█         | 111/1024 [4:37:19<38:00:12, 149.85s/it][A
+                                                        [A{'loss': 0.0266, 'grad_norm': 0.0010949905263260007, 'learning_rate': 1e-05, 'num_tokens': 82477310.0, 'completions/mean_length': 5866.84375, 'completions/min_length': 499.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5699.9052734375, 'completions/min_terminated_length': 499.0, 'completions/max_terminated_length': 16010.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.26933354139328003, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020769795402884483, 'sampling/sampling_logp_difference/max': 9.310929298400879, 'sampling/importance_sampling_ratio/min': 9.04304688447155e-05, 'sampling/importance_sampling_ratio/mean': 0.9999667406082153, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9848997294902802, 'clip_ratio/low_mean': 2.43532002741631e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.43532002741631e-05, 'epoch': 0.1}
+
+ 11%|█         | 111/1024 [4:37:19<38:00:12, 149.85s/it][AINFO 12-02 04:50:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:50:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:50:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:50:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 112/1024 [4:39:46<37:43:39, 148.92s/it][A
+                                                        [A{'loss': 0.0571, 'grad_norm': 0.0020384234376251698, 'learning_rate': 1e-05, 'num_tokens': 83345055.0, 'completions/mean_length': 6613.7578125, 'completions/min_length': 1033.0, 'completions/max_length': 14501.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6613.7578125, 'completions/min_terminated_length': 1033.0, 'completions/max_terminated_length': 14501.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018883168697357178, 'sampling/sampling_logp_difference/max': 3.5219533443450928, 'sampling/importance_sampling_ratio/min': 0.029541675001382828, 'sampling/importance_sampling_ratio/mean': 0.9999457001686096, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9176012054085732, 'clip_ratio/low_mean': 5.842190330440644e-05, 'clip_ratio/low_min': 1.2287753634154797e-05, 'clip_ratio/high_mean': 4.826903364119062e-06, 'clip_ratio/high_max': 1.9307613456476247e-05, 'clip_ratio/region_mean': 6.324880496322294e-05, 'epoch': 0.1}
+
+ 11%|█         | 112/1024 [4:39:46<37:43:39, 148.92s/it][AINFO 12-02 04:53:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:53:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:53:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:53:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 113/1024 [4:42:30<38:50:32, 153.49s/it][A
+                                                        [A{'loss': 0.0351, 'grad_norm': 0.0031166900880634785, 'learning_rate': 1e-05, 'num_tokens': 84186343.0, 'completions/mean_length': 6392.3125, 'completions/min_length': 507.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6070.0, 'completions/min_terminated_length': 507.0, 'completions/max_terminated_length': 16310.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.34139877557754517, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01931958645582199, 'sampling/sampling_logp_difference/max': 8.398082733154297, 'sampling/importance_sampling_ratio/min': 0.00022529886336997151, 'sampling/importance_sampling_ratio/mean': 0.9999208450317383, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.904954232275486, 'clip_ratio/low_mean': 5.789885449303256e-05, 'clip_ratio/low_min': 1.017130716718384e-05, 'clip_ratio/high_mean': 3.455107957961445e-06, 'clip_ratio/high_max': 1.382043183184578e-05, 'clip_ratio/region_mean': 6.135396188255982e-05, 'epoch': 0.1}
+
+ 11%|█         | 113/1024 [4:42:30<38:50:32, 153.49s/it][AINFO 12-02 04:56:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:56:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:56:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:56:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 114/1024 [4:45:17<39:48:13, 157.47s/it][A
+                                                        [A{'loss': 0.1043, 'grad_norm': 0.0022041688207536936, 'learning_rate': 1e-05, 'num_tokens': 84971129.0, 'completions/mean_length': 5982.703125, 'completions/min_length': 294.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5817.603515625, 'completions/min_terminated_length': 294.0, 'completions/max_terminated_length': 16180.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.30774885416030884, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01795877143740654, 'sampling/sampling_logp_difference/max': 13.374939918518066, 'sampling/importance_sampling_ratio/min': 1.553593506287143e-06, 'sampling/importance_sampling_ratio/mean': 0.9999030828475952, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8394555225968361, 'clip_ratio/low_mean': 3.147818074467068e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.549099907715572e-06, 'clip_ratio/high_max': 1.7221671441802755e-05, 'clip_ratio/region_mean': 3.802728065238625e-05, 'epoch': 0.1}
+
+ 11%|█         | 114/1024 [4:45:17<39:48:13, 157.47s/it][AINFO 12-02 04:58:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:58:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:58:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:58:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 115/1024 [4:47:56<39:56:16, 158.17s/it][A
+                                                        [A{'loss': 0.1244, 'grad_norm': 0.0037972736172378063, 'learning_rate': 1e-05, 'num_tokens': 85625559.0, 'completions/mean_length': 4976.921875, 'completions/min_length': 335.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4608.95166015625, 'completions/min_terminated_length': 335.0, 'completions/max_terminated_length': 14988.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.39082521200180054, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01815030723810196, 'sampling/sampling_logp_difference/max': 5.8122453689575195, 'sampling/importance_sampling_ratio/min': 0.002990707289427519, 'sampling/importance_sampling_ratio/mean': 0.999970555305481, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8381234556436539, 'clip_ratio/low_mean': 4.788733849636628e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.398806923854863e-06, 'clip_ratio/high_max': 2.9651660042873118e-05, 'clip_ratio/region_mean': 5.728614519284747e-05, 'epoch': 0.11}
+
+ 11%|█         | 115/1024 [4:47:56<39:56:16, 158.17s/it][AINFO 12-02 05:01:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:01:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:01:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:01:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█▏        | 116/1024 [4:50:37<40:02:25, 158.75s/it][A
+                                                        [A{'loss': 0.0146, 'grad_norm': 0.0012413962977007031, 'learning_rate': 1e-05, 'num_tokens': 86453606.0, 'completions/mean_length': 6307.2421875, 'completions/min_length': 823.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6065.400390625, 'completions/min_terminated_length': 823.0, 'completions/max_terminated_length': 15984.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.2280253767967224, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021790307015180588, 'sampling/sampling_logp_difference/max': 5.353684425354004, 'sampling/importance_sampling_ratio/min': 0.004730688873678446, 'sampling/importance_sampling_ratio/mean': 1.0000064373016357, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1176434755325317, 'clip_ratio/low_mean': 1.6904315600640984e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0326482424716232e-06, 'clip_ratio/high_max': 4.130592969886493e-06, 'clip_ratio/region_mean': 1.7936963843112608e-05, 'epoch': 0.11}
+
+ 11%|█▏        | 116/1024 [4:50:37<40:02:25, 158.75s/it][AINFO 12-02 05:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:04:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█▏        | 117/1024 [4:53:43<42:06:22, 167.12s/it][A
+                                                        [A{'loss': 0.0349, 'grad_norm': 0.0017762042116373777, 'learning_rate': 1e-05, 'num_tokens': 87402763.0, 'completions/mean_length': 7263.1640625, 'completions/min_length': 48.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7044.26416015625, 'completions/min_terminated_length': 48.0, 'completions/max_terminated_length': 16329.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.27776598930358887, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02103034406900406, 'sampling/sampling_logp_difference/max': 6.968719005584717, 'sampling/importance_sampling_ratio/min': 0.0009408573969267309, 'sampling/importance_sampling_ratio/mean': 0.9999741315841675, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.107876107096672, 'clip_ratio/low_mean': 3.582628983167524e-05, 'clip_ratio/low_min': 2.61966624748311e-06, 'clip_ratio/high_mean': 3.2901932058848615e-06, 'clip_ratio/high_max': 1.3160772823539446e-05, 'clip_ratio/region_mean': 3.911648195753514e-05, 'epoch': 0.11}
+
+ 11%|█▏        | 117/1024 [4:53:43<42:06:22, 167.12s/it][AINFO 12-02 05:07:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:07:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:07:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:07:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 118/1024 [4:56:16<40:58:08, 162.79s/it][A
+                                                        [A{'loss': 0.0194, 'grad_norm': 0.0032127038575708866, 'learning_rate': 1e-05, 'num_tokens': 88077385.0, 'completions/mean_length': 5093.859375, 'completions/min_length': 364.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4914.65087890625, 'completions/min_terminated_length': 364.0, 'completions/max_terminated_length': 15416.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.345874547958374, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020314980298280716, 'sampling/sampling_logp_difference/max': 9.562187194824219, 'sampling/importance_sampling_ratio/min': 7.033879228401929e-05, 'sampling/importance_sampling_ratio/mean': 0.9999171495437622, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1065888702869415, 'clip_ratio/low_mean': 4.26799579145154e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1877163728968299e-05, 'clip_ratio/high_max': 3.987745776612428e-05, 'clip_ratio/region_mean': 5.455712096136267e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 118/1024 [4:56:16<40:58:08, 162.79s/it][AINFO 12-02 05:09:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:09:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:09:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:09:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 119/1024 [4:59:05<41:25:52, 164.81s/it][A
+                                                        [A{'loss': 0.0862, 'grad_norm': 0.0028925195802003145, 'learning_rate': 1e-05, 'num_tokens': 88985269.0, 'completions/mean_length': 6943.53125, 'completions/min_length': 307.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6639.0, 'completions/min_terminated_length': 307.0, 'completions/max_terminated_length': 15740.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3535328209400177, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019378282129764557, 'sampling/sampling_logp_difference/max': 16.540752410888672, 'sampling/importance_sampling_ratio/min': 6.553035092338177e-08, 'sampling/importance_sampling_ratio/mean': 0.999980628490448, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9009081721305847, 'clip_ratio/low_mean': 3.470697703278347e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.4788730519940145e-06, 'clip_ratio/high_max': 9.35208754526684e-06, 'clip_ratio/region_mean': 3.918584917528278e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 119/1024 [4:59:05<41:25:52, 164.81s/it][AINFO 12-02 05:12:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:12:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:12:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:12:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 120/1024 [5:01:21<39:12:56, 156.17s/it][A
+                                                        [A{'loss': 0.0215, 'grad_norm': 0.0017386430408805609, 'learning_rate': 1e-05, 'num_tokens': 89645205.0, 'completions/mean_length': 4976.25, 'completions/min_length': 702.0, 'completions/max_length': 15148.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4976.25, 'completions/min_terminated_length': 702.0, 'completions/max_terminated_length': 15148.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.26462042331695557, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018035830929875374, 'sampling/sampling_logp_difference/max': 11.74996566772461, 'sampling/importance_sampling_ratio/min': 7.889595508459024e-06, 'sampling/importance_sampling_ratio/mean': 0.9999554753303528, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9463540017604828, 'clip_ratio/low_mean': 2.4615862798782473e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.734990403041593e-06, 'clip_ratio/high_max': 1.0939961612166371e-05, 'clip_ratio/region_mean': 2.7350853201824066e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 120/1024 [5:01:21<39:12:56, 156.17s/it][AINFO 12-02 05:14:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:14:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:14:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:14:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 121/1024 [5:03:53<38:50:19, 154.84s/it][A
+                                                        [A{'loss': 0.0666, 'grad_norm': 0.0037381781730800867, 'learning_rate': 1e-05, 'num_tokens': 90489394.0, 'completions/mean_length': 6439.5390625, 'completions/min_length': 959.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6281.69091796875, 'completions/min_terminated_length': 959.0, 'completions/max_terminated_length': 15892.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.2624938488006592, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019368179142475128, 'sampling/sampling_logp_difference/max': 5.62494421005249, 'sampling/importance_sampling_ratio/min': 0.003606764366850257, 'sampling/importance_sampling_ratio/mean': 0.9999206066131592, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.899876207113266, 'clip_ratio/low_mean': 2.6826061798601586e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.485407324253174e-06, 'clip_ratio/high_max': 5.941629297012696e-06, 'clip_ratio/region_mean': 2.8311469009167922e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 121/1024 [5:03:53<38:50:19, 154.84s/it][AINFO 12-02 05:17:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 122/1024 [5:06:56<40:52:09, 163.11s/it][A
+                                                        [A{'loss': 0.095, 'grad_norm': 0.0018562980694696307, 'learning_rate': 1e-05, 'num_tokens': 91390054.0, 'completions/mean_length': 6876.46875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6408.884765625, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15979.0, 'rewards/accuracy_reward/mean': 0.21875, 'rewards/accuracy_reward/std': 0.41502299904823303, 'reward': 0.21875, 'reward_std': 0.29955869913101196, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020825792104005814, 'sampling/sampling_logp_difference/max': 10.436432838439941, 'sampling/importance_sampling_ratio/min': 2.9343695132411085e-05, 'sampling/importance_sampling_ratio/mean': 0.9999849796295166, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1018569767475128, 'clip_ratio/low_mean': 3.058137212974543e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2974880974070402e-06, 'clip_ratio/high_max': 5.189952389628161e-06, 'clip_ratio/region_mean': 3.187886022715247e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 122/1024 [5:06:56<40:52:09, 163.11s/it][AINFO 12-02 05:20:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:20:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:20:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:20:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 123/1024 [5:09:40<40:57:14, 163.63s/it][A
+                                                        [A{'loss': 0.0552, 'grad_norm': 0.0016695430967956781, 'learning_rate': 1e-05, 'num_tokens': 92241535.0, 'completions/mean_length': 6501.5078125, 'completions/min_length': 720.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6344.64306640625, 'completions/min_terminated_length': 720.0, 'completions/max_terminated_length': 14153.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.28641316294670105, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020438479259610176, 'sampling/sampling_logp_difference/max': 8.343140602111816, 'sampling/importance_sampling_ratio/min': 0.0002380236255703494, 'sampling/importance_sampling_ratio/mean': 0.9998984336853027, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.073579266667366, 'clip_ratio/low_mean': 3.029032552603894e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.055208589510585e-06, 'clip_ratio/high_max': 2.022083435804234e-05, 'clip_ratio/region_mean': 3.53455343429232e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 123/1024 [5:09:40<40:57:14, 163.63s/it][AINFO 12-02 05:23:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:23:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:23:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:23:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 124/1024 [5:11:58<38:58:48, 155.92s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0034909825772047043, 'learning_rate': 1e-05, 'num_tokens': 92962472.0, 'completions/mean_length': 5474.1328125, 'completions/min_length': 14.0, 'completions/max_length': 14345.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5474.1328125, 'completions/min_terminated_length': 14.0, 'completions/max_terminated_length': 14345.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.27564430236816406, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019930578768253326, 'sampling/sampling_logp_difference/max': 6.328239917755127, 'sampling/importance_sampling_ratio/min': 0.0017851731972768903, 'sampling/importance_sampling_ratio/mean': 1.000006079673767, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0692576617002487, 'clip_ratio/low_mean': 2.2190370486896427e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.477795176986547e-07, 'clip_ratio/high_max': 3.3911180707946187e-06, 'clip_ratio/region_mean': 2.30381500614385e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 124/1024 [5:11:58<38:58:48, 155.92s/it][AINFO 12-02 05:25:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:25:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:25:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:25:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 125/1024 [5:15:00<40:52:25, 163.68s/it][A
+                                                        [A{'loss': 0.0454, 'grad_norm': 0.0023449272848665714, 'learning_rate': 1e-05, 'num_tokens': 93950506.0, 'completions/mean_length': 7512.078125, 'completions/min_length': 486.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7225.88671875, 'completions/min_terminated_length': 486.0, 'completions/max_terminated_length': 16198.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.22461043298244476, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020141655579209328, 'sampling/sampling_logp_difference/max': 6.412672996520996, 'sampling/importance_sampling_ratio/min': 0.0016406332142651081, 'sampling/importance_sampling_ratio/mean': 0.9999359250068665, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9676955863833427, 'clip_ratio/low_mean': 4.615546390596137e-05, 'clip_ratio/low_min': 1.366510537081922e-05, 'clip_ratio/high_mean': 7.620442374900449e-06, 'clip_ratio/high_max': 2.6292200345778838e-05, 'clip_ratio/region_mean': 5.3775906508235494e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 125/1024 [5:15:00<40:52:25, 163.68s/it][AINFO 12-02 05:28:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:28:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:28:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:28:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 126/1024 [5:17:49<41:14:11, 165.31s/it][A
+                                                        [A{'loss': 0.0617, 'grad_norm': 0.0037103090435266495, 'learning_rate': 1e-05, 'num_tokens': 94854016.0, 'completions/mean_length': 6920.484375, 'completions/min_length': 962.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6693.3603515625, 'completions/min_terminated_length': 962.0, 'completions/max_terminated_length': 16183.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.322716623544693, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01915796287357807, 'sampling/sampling_logp_difference/max': 7.648271083831787, 'sampling/importance_sampling_ratio/min': 0.00047686786274425685, 'sampling/importance_sampling_ratio/mean': 0.9999761581420898, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8662540689110756, 'clip_ratio/low_mean': 3.3802551342887455e-05, 'clip_ratio/low_min': 4.146762421441963e-06, 'clip_ratio/high_mean': 1.2743139450321905e-06, 'clip_ratio/high_max': 5.097255780128762e-06, 'clip_ratio/region_mean': 3.5076865287919645e-05, 'epoch': 0.12}
+
+ 12%|█▏        | 126/1024 [5:17:49<41:14:11, 165.31s/it][AINFO 12-02 05:31:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:31:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:31:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:31:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 127/1024 [5:20:42<41:43:08, 167.43s/it][A
+                                                        [A{'loss': 0.0788, 'grad_norm': 0.0024642283096909523, 'learning_rate': 1e-05, 'num_tokens': 95889966.0, 'completions/mean_length': 7939.609375, 'completions/min_length': 1260.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7805.57177734375, 'completions/min_terminated_length': 1260.0, 'completions/max_terminated_length': 15867.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.27434611320495605, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020453302189707756, 'sampling/sampling_logp_difference/max': 9.999995231628418, 'sampling/importance_sampling_ratio/min': 4.540014560916461e-05, 'sampling/importance_sampling_ratio/mean': 0.9998771548271179, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9707008600234985, 'clip_ratio/low_mean': 5.024227584726759e-05, 'clip_ratio/low_min': 1.3627016414829995e-05, 'clip_ratio/high_mean': 2.123060994563275e-06, 'clip_ratio/high_max': 8.4922439782531e-06, 'clip_ratio/region_mean': 5.236533706920454e-05, 'epoch': 0.12}
+
+ 12%|█▏        | 127/1024 [5:20:42<41:43:08, 167.43s/it][AINFO 12-02 05:34:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:34:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:34:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:34:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▎        | 128/1024 [5:23:20<41:00:51, 164.79s/it][A
+                                                        [A{'loss': 0.0537, 'grad_norm': 0.003317479742690921, 'learning_rate': 1e-05, 'num_tokens': 96676847.0, 'completions/mean_length': 5985.8203125, 'completions/min_length': 315.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5474.43408203125, 'completions/min_terminated_length': 315.0, 'completions/max_terminated_length': 14969.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.287486732006073, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01996719278395176, 'sampling/sampling_logp_difference/max': 8.156899452209473, 'sampling/importance_sampling_ratio/min': 0.000286750087980181, 'sampling/importance_sampling_ratio/mean': 0.9999130964279175, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9083090648055077, 'clip_ratio/low_mean': 3.766829564710861e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.766829564710861e-05, 'epoch': 0.12}
+
+ 12%|█▎        | 128/1024 [5:23:20<41:00:51, 164.79s/it][AINFO 12-02 05:36:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:36:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:36:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:36:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 129/1024 [5:26:15<41:41:47, 167.72s/it][A
+                                                        [A{'loss': 0.0622, 'grad_norm': 0.0019073591101914644, 'learning_rate': 1e-05, 'num_tokens': 97539453.0, 'completions/mean_length': 6565.359375, 'completions/min_length': 248.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6488.04736328125, 'completions/min_terminated_length': 248.0, 'completions/max_terminated_length': 15081.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.307217001914978, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02150837704539299, 'sampling/sampling_logp_difference/max': 7.414913177490234, 'sampling/importance_sampling_ratio/min': 0.0006022047018632293, 'sampling/importance_sampling_ratio/mean': 0.9999555945396423, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1013468354940414, 'clip_ratio/low_mean': 5.708034223061986e-05, 'clip_ratio/low_min': 2.75287948170444e-06, 'clip_ratio/high_mean': 4.609963411894569e-06, 'clip_ratio/high_max': 1.8439853647578275e-05, 'clip_ratio/region_mean': 6.169030598357494e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 129/1024 [5:26:15<41:41:47, 167.72s/it][AINFO 12-02 05:39:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:39:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:39:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:39:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 130/1024 [5:29:07<41:58:51, 169.05s/it][A
+                                                        [A{'loss': 0.0249, 'grad_norm': 0.0010163087863475084, 'learning_rate': 1e-05, 'num_tokens': 98429036.0, 'completions/mean_length': 6779.6171875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6703.9921875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16099.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.22567126154899597, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018716152757406235, 'sampling/sampling_logp_difference/max': 17.178054809570312, 'sampling/importance_sampling_ratio/min': 3.464699460664633e-08, 'sampling/importance_sampling_ratio/mean': 0.9999485015869141, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8940552547574043, 'clip_ratio/low_mean': 1.9822365402433206e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.2671213173452998e-06, 'clip_ratio/high_max': 9.068485269381199e-06, 'clip_ratio/region_mean': 2.208948649240483e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 130/1024 [5:29:07<41:58:51, 169.05s/it][AINFO 12-02 05:42:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:42:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:42:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:42:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 131/1024 [5:31:27<39:45:25, 160.28s/it][A
+                                                        [A{'loss': 0.0238, 'grad_norm': 0.002155766822397709, 'learning_rate': 1e-05, 'num_tokens': 99184264.0, 'completions/mean_length': 5766.71875, 'completions/min_length': 47.0, 'completions/max_length': 14549.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5766.71875, 'completions/min_terminated_length': 47.0, 'completions/max_terminated_length': 14549.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3077537715435028, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020948775112628937, 'sampling/sampling_logp_difference/max': 9.133563995361328, 'sampling/importance_sampling_ratio/min': 0.00010798005678225309, 'sampling/importance_sampling_ratio/mean': 0.9999253749847412, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0455922111868858, 'clip_ratio/low_mean': 2.9014110396019532e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2618105529327295e-06, 'clip_ratio/high_max': 5.047242211730918e-06, 'clip_ratio/region_mean': 3.0275920835265424e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 131/1024 [5:31:27<39:45:25, 160.28s/it][AINFO 12-02 05:45:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:45:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:45:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:45:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 132/1024 [5:34:12<40:04:28, 161.74s/it][A
+                                                        [A{'loss': -0.0279, 'grad_norm': 0.0021464223973453045, 'learning_rate': 1e-05, 'num_tokens': 99996831.0, 'completions/mean_length': 6167.2421875, 'completions/min_length': 218.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6005.07177734375, 'completions/min_terminated_length': 218.0, 'completions/max_terminated_length': 14704.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3916535973548889, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01866895705461502, 'sampling/sampling_logp_difference/max': 3.794421911239624, 'sampling/importance_sampling_ratio/min': 0.02249590866267681, 'sampling/importance_sampling_ratio/mean': 0.9999240040779114, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9100174158811569, 'clip_ratio/low_mean': 4.496008500609605e-05, 'clip_ratio/low_min': 7.757854064038838e-06, 'clip_ratio/high_mean': 6.505383225885453e-06, 'clip_ratio/high_max': 2.0882574972347356e-05, 'clip_ratio/region_mean': 5.1465468231981504e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 132/1024 [5:34:12<40:04:28, 161.74s/it][AINFO 12-02 05:47:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:47:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:47:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:47:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 133/1024 [5:36:42<39:11:04, 158.32s/it][A
+                                                        [A{'loss': 0.0524, 'grad_norm': 0.0023277695290744305, 'learning_rate': 1e-05, 'num_tokens': 100814112.0, 'completions/mean_length': 6242.9453125, 'completions/min_length': 1187.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6163.09423828125, 'completions/min_terminated_length': 1187.0, 'completions/max_terminated_length': 15738.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0191188994795084, 'sampling/sampling_logp_difference/max': 8.33753490447998, 'sampling/importance_sampling_ratio/min': 0.0002393616596236825, 'sampling/importance_sampling_ratio/mean': 0.999959409236908, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8624134212732315, 'clip_ratio/low_mean': 3.0998018473837874e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.0998018473837874e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 133/1024 [5:36:42<39:11:04, 158.32s/it][AINFO 12-02 05:50:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:50:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:50:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:50:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 134/1024 [5:39:25<39:28:16, 159.66s/it][A
+                                                        [A{'loss': 0.0321, 'grad_norm': 0.003547821193933487, 'learning_rate': 1e-05, 'num_tokens': 101566264.0, 'completions/mean_length': 5733.6875, 'completions/min_length': 789.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5478.080078125, 'completions/min_terminated_length': 789.0, 'completions/max_terminated_length': 14866.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.36584997177124023, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019646761938929558, 'sampling/sampling_logp_difference/max': 8.961891174316406, 'sampling/importance_sampling_ratio/min': 0.0001282035664189607, 'sampling/importance_sampling_ratio/mean': 0.9999400973320007, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9628067463636398, 'clip_ratio/low_mean': 4.329304238126497e-05, 'clip_ratio/low_min': 3.5120251595799346e-06, 'clip_ratio/high_mean': 1.647468138799013e-06, 'clip_ratio/high_max': 6.589872555196052e-06, 'clip_ratio/region_mean': 4.494051017900347e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 134/1024 [5:39:25<39:28:16, 159.66s/it][AINFO 12-02 05:52:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:52:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:52:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:52:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 135/1024 [5:41:39<37:32:50, 152.05s/it][A
+                                                        [A{'loss': 0.1406, 'grad_norm': 0.0024891747161746025, 'learning_rate': 1e-05, 'num_tokens': 102291456.0, 'completions/mean_length': 5505.9375, 'completions/min_length': 668.0, 'completions/max_length': 15848.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5505.9375, 'completions/min_terminated_length': 668.0, 'completions/max_terminated_length': 15848.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.35482609272003174, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01716250739991665, 'sampling/sampling_logp_difference/max': 6.527429103851318, 'sampling/importance_sampling_ratio/min': 0.0014627616619691253, 'sampling/importance_sampling_ratio/mean': 0.9999248385429382, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8041045889258385, 'clip_ratio/low_mean': 3.014796902789385e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.2768818957483745e-06, 'clip_ratio/high_max': 1.7107527582993498e-05, 'clip_ratio/region_mean': 3.442485103732906e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 135/1024 [5:41:39<37:32:50, 152.05s/it][AINFO 12-02 05:55:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:55:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:55:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:55:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 136/1024 [5:44:11<37:28:59, 151.96s/it][A
+                                                        [A{'loss': 0.1361, 'grad_norm': 0.00220683915540576, 'learning_rate': 1e-05, 'num_tokens': 102949824.0, 'completions/mean_length': 5003.0625, 'completions/min_length': 497.0, 'completions/max_length': 16005.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5003.0625, 'completions/min_terminated_length': 497.0, 'completions/max_terminated_length': 16005.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018076512962579727, 'sampling/sampling_logp_difference/max': 9.393817901611328, 'sampling/importance_sampling_ratio/min': 8.323705696966499e-05, 'sampling/importance_sampling_ratio/mean': 0.999973714351654, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9115714654326439, 'clip_ratio/low_mean': 5.380711581892683e-05, 'clip_ratio/low_min': 4.5777483137499075e-06, 'clip_ratio/high_mean': 3.872257764214737e-06, 'clip_ratio/high_max': 1.548903105685895e-05, 'clip_ratio/region_mean': 5.767937363998499e-05, 'epoch': 0.13}
+
+ 13%|█▎        | 136/1024 [5:44:11<37:28:59, 151.96s/it][AINFO 12-02 05:57:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:57:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:57:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:57:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 137/1024 [5:46:31<36:33:54, 148.40s/it][A
+                                                        [A{'loss': 0.0215, 'grad_norm': 0.003964806906878948, 'learning_rate': 1e-05, 'num_tokens': 103580913.0, 'completions/mean_length': 4742.1328125, 'completions/min_length': 462.0, 'completions/max_length': 15658.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4742.1328125, 'completions/min_terminated_length': 462.0, 'completions/max_terminated_length': 15658.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2914257347583771, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019651200622320175, 'sampling/sampling_logp_difference/max': 9.56246280670166, 'sampling/importance_sampling_ratio/min': 7.031940185697749e-05, 'sampling/importance_sampling_ratio/mean': 0.999952495098114, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9430246204137802, 'clip_ratio/low_mean': 3.4416837252138066e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.4528402415598975e-06, 'clip_ratio/high_max': 2.181136096623959e-05, 'clip_ratio/region_mean': 3.986967681157694e-05, 'epoch': 0.13}
+
+ 13%|█▎        | 137/1024 [5:46:31<36:33:54, 148.40s/it][AINFO 12-02 06:00:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:00:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:00:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:00:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 138/1024 [5:49:33<39:01:24, 158.56s/it][A
+                                                        [A{'loss': 0.0377, 'grad_norm': 0.002067410387098789, 'learning_rate': 1e-05, 'num_tokens': 104447463.0, 'completions/mean_length': 6574.171875, 'completions/min_length': 10.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6091.72119140625, 'completions/min_terminated_length': 10.0, 'completions/max_terminated_length': 15944.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.24511480331420898, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018853647634387016, 'sampling/sampling_logp_difference/max': 8.456169128417969, 'sampling/importance_sampling_ratio/min': 0.00021258489869069308, 'sampling/importance_sampling_ratio/mean': 0.9997583627700806, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8429529070854187, 'clip_ratio/low_mean': 3.8682398553646635e-05, 'clip_ratio/low_min': 8.189203072106466e-06, 'clip_ratio/high_mean': 1.019210117192415e-06, 'clip_ratio/high_max': 4.07684046876966e-06, 'clip_ratio/region_mean': 3.970160832977854e-05, 'epoch': 0.13}
+
+ 13%|█▎        | 138/1024 [5:49:33<39:01:24, 158.56s/it][AINFO 12-02 06:03:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:03:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:03:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:03:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▎        | 139/1024 [5:51:53<37:32:33, 152.72s/it][A
+                                                        [A{'loss': 0.0946, 'grad_norm': 0.004292502999305725, 'learning_rate': 1e-05, 'num_tokens': 105052287.0, 'completions/mean_length': 4581.5625, 'completions/min_length': 301.0, 'completions/max_length': 15518.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4581.5625, 'completions/min_terminated_length': 301.0, 'completions/max_terminated_length': 15518.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.625, 'reward_std': 0.3908300995826721, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.016310662031173706, 'sampling/sampling_logp_difference/max': 6.24802827835083, 'sampling/importance_sampling_ratio/min': 0.0019342642044648528, 'sampling/importance_sampling_ratio/mean': 0.9999477863311768, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7094272822141647, 'clip_ratio/low_mean': 5.9263072444082354e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.931455805490259e-06, 'clip_ratio/high_max': 1.9725823221961036e-05, 'clip_ratio/region_mean': 6.419452870431996e-05, 'epoch': 0.13}
+
+ 14%|█▎        | 139/1024 [5:51:53<37:32:33, 152.72s/it][AINFO 12-02 06:05:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:05:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:05:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:05:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▎        | 140/1024 [5:54:54<39:37:56, 161.40s/it][A
+                                                        [A{'loss': 0.0485, 'grad_norm': 0.0015785128343850374, 'learning_rate': 1e-05, 'num_tokens': 105977048.0, 'completions/mean_length': 7066.4453125, 'completions/min_length': 990.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6918.5478515625, 'completions/min_terminated_length': 990.0, 'completions/max_terminated_length': 16097.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.27328038215637207, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018304405733942986, 'sampling/sampling_logp_difference/max': 6.8676018714904785, 'sampling/importance_sampling_ratio/min': 0.00104097044095397, 'sampling/importance_sampling_ratio/mean': 1.0000159740447998, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8481669947504997, 'clip_ratio/low_mean': 4.6397121650443296e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5330745074825245e-06, 'clip_ratio/high_max': 1.0132298029930098e-05, 'clip_ratio/region_mean': 4.893019581686531e-05, 'epoch': 0.13}
+
+ 14%|█▎        | 140/1024 [5:54:54<39:37:56, 161.40s/it][AINFO 12-02 06:08:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:08:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:08:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:08:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 141/1024 [5:57:29<39:06:37, 159.45s/it][A
+                                                        [A{'loss': 0.0725, 'grad_norm': 0.002542720176279545, 'learning_rate': 1e-05, 'num_tokens': 106793187.0, 'completions/mean_length': 6230.5234375, 'completions/min_length': 220.0, 'completions/max_length': 16104.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6230.5234375, 'completions/min_terminated_length': 220.0, 'completions/max_terminated_length': 16104.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.3050953149795532, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019391046836972237, 'sampling/sampling_logp_difference/max': 8.187352180480957, 'sampling/importance_sampling_ratio/min': 0.0002781494113150984, 'sampling/importance_sampling_ratio/mean': 1.0000169277191162, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9658062160015106, 'clip_ratio/low_mean': 2.3075059743860038e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.2472559016459854e-06, 'clip_ratio/high_max': 1.6989023606583942e-05, 'clip_ratio/region_mean': 2.7322315418132348e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 141/1024 [5:57:29<39:06:37, 159.45s/it][AINFO 12-02 06:11:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:11:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:11:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:11:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 142/1024 [5:59:54<38:00:57, 155.17s/it][A
+                                                        [A{'loss': 0.0444, 'grad_norm': 0.001451602904126048, 'learning_rate': 1e-05, 'num_tokens': 107539874.0, 'completions/mean_length': 5690.5546875, 'completions/min_length': 1124.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5606.3544921875, 'completions/min_terminated_length': 1124.0, 'completions/max_terminated_length': 14216.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.23304283618927002, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018607191741466522, 'sampling/sampling_logp_difference/max': 18.993377685546875, 'sampling/importance_sampling_ratio/min': 5.640022671116185e-09, 'sampling/importance_sampling_ratio/mean': 0.9999307990074158, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0098655670881271, 'clip_ratio/low_mean': 2.7597974508353218e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.7597974508353218e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 142/1024 [5:59:54<38:00:57, 155.17s/it][AINFO 12-02 06:13:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:13:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:13:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:13:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 143/1024 [6:02:10<36:31:38, 149.26s/it][A
+                                                        [A{'loss': 0.0411, 'grad_norm': 0.0023549250327050686, 'learning_rate': 1e-05, 'num_tokens': 108260091.0, 'completions/mean_length': 5471.1328125, 'completions/min_length': 4.0, 'completions/max_length': 15791.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5471.1328125, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15791.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.30061954259872437, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020427243784070015, 'sampling/sampling_logp_difference/max': 6.749967098236084, 'sampling/importance_sampling_ratio/min': 0.0011709182290360332, 'sampling/importance_sampling_ratio/mean': 0.9999832510948181, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0413162112236023, 'clip_ratio/low_mean': 2.350350996493944e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.19954119479371e-06, 'clip_ratio/high_max': 1.2800467629858758e-05, 'clip_ratio/region_mean': 2.770305115973315e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 143/1024 [6:02:10<36:31:38, 149.26s/it][AINFO 12-02 06:15:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:15:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:15:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:15:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 144/1024 [6:04:43<36:45:05, 150.35s/it][A
+                                                        [A{'loss': 0.0686, 'grad_norm': 0.0017527056625112891, 'learning_rate': 1e-05, 'num_tokens': 109073890.0, 'completions/mean_length': 6211.7421875, 'completions/min_length': 622.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6050.2783203125, 'completions/min_terminated_length': 622.0, 'completions/max_terminated_length': 16277.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.29826050996780396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018898162990808487, 'sampling/sampling_logp_difference/max': 5.843511581420898, 'sampling/importance_sampling_ratio/min': 0.002898645820096135, 'sampling/importance_sampling_ratio/mean': 0.9999092221260071, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9706784337759018, 'clip_ratio/low_mean': 4.361141452591255e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.495766231433663e-06, 'clip_ratio/high_max': 2.1983064925734652e-05, 'clip_ratio/region_mean': 4.9107180757346214e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 144/1024 [6:04:43<36:45:05, 150.35s/it][AINFO 12-02 06:18:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:18:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:18:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:18:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 145/1024 [6:07:07<36:16:34, 148.57s/it][A
+                                                        [A{'loss': 0.011, 'grad_norm': 0.0007858420140109956, 'learning_rate': 1e-05, 'num_tokens': 109861813.0, 'completions/mean_length': 6007.8984375, 'completions/min_length': 156.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5926.19677734375, 'completions/min_terminated_length': 156.0, 'completions/max_terminated_length': 14880.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.23486506938934326, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021845955401659012, 'sampling/sampling_logp_difference/max': 17.22846221923828, 'sampling/importance_sampling_ratio/min': 3.294382011631569e-08, 'sampling/importance_sampling_ratio/mean': 0.9999340772628784, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1967609524726868, 'clip_ratio/low_mean': 4.208964992358233e-05, 'clip_ratio/low_min': 3.9168990042526275e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.208964992358233e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 145/1024 [6:07:07<36:16:34, 148.57s/it][AINFO 12-02 06:20:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:20:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:20:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:20:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 146/1024 [6:09:48<37:09:55, 152.39s/it][A
+                                                        [A{'loss': 0.0405, 'grad_norm': 0.0029428249690681696, 'learning_rate': 1e-05, 'num_tokens': 110756572.0, 'completions/mean_length': 6800.9921875, 'completions/min_length': 5.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6725.53564453125, 'completions/min_terminated_length': 5.0, 'completions/max_terminated_length': 15838.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.3248382806777954, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02010120078921318, 'sampling/sampling_logp_difference/max': 7.365129470825195, 'sampling/importance_sampling_ratio/min': 0.0006329434108920395, 'sampling/importance_sampling_ratio/mean': 0.9999890327453613, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0437887012958527, 'clip_ratio/low_mean': 3.749712686840212e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1279552154519479e-06, 'clip_ratio/high_max': 4.5118208618077915e-06, 'clip_ratio/region_mean': 3.8625082197540905e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 146/1024 [6:09:48<37:09:55, 152.39s/it][AINFO 12-02 06:23:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:23:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:23:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:23:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 147/1024 [6:12:31<37:53:30, 155.54s/it][A
+                                                        [A{'loss': 0.0316, 'grad_norm': 0.002088683657348156, 'learning_rate': 1e-05, 'num_tokens': 111585493.0, 'completions/mean_length': 6309.4453125, 'completions/min_length': 283.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6230.1181640625, 'completions/min_terminated_length': 283.0, 'completions/max_terminated_length': 15834.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.39796435832977295, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.020927833393216133, 'sampling/sampling_logp_difference/max': 4.633236885070801, 'sampling/importance_sampling_ratio/min': 0.009723234921693802, 'sampling/importance_sampling_ratio/mean': 1.000007152557373, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9768906533718109, 'clip_ratio/low_mean': 4.964020990883e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5692513051799324e-06, 'clip_ratio/high_max': 1.427700522071973e-05, 'clip_ratio/region_mean': 5.320946092979284e-05, 'epoch': 0.14}
+
+ 14%|█▍        | 147/1024 [6:12:31<37:53:30, 155.54s/it][AINFO 12-02 06:26:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:26:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:26:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:26:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 148/1024 [6:15:22<38:55:23, 159.96s/it][A
+                                                        [A{'loss': 0.1051, 'grad_norm': 0.002406956860795617, 'learning_rate': 1e-05, 'num_tokens': 112400363.0, 'completions/mean_length': 6221.859375, 'completions/min_length': 83.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6060.5556640625, 'completions/min_terminated_length': 83.0, 'completions/max_terminated_length': 15209.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.31929677724838257, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018652018159627914, 'sampling/sampling_logp_difference/max': 9.74976634979248, 'sampling/importance_sampling_ratio/min': 5.8308287407271564e-05, 'sampling/importance_sampling_ratio/mean': 0.9999701976776123, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9212924689054489, 'clip_ratio/low_mean': 5.122006064084417e-05, 'clip_ratio/low_min': 3.785125954891555e-06, 'clip_ratio/high_mean': 1.3710349549000966e-06, 'clip_ratio/high_max': 5.4841398196003865e-06, 'clip_ratio/region_mean': 5.25910957094311e-05, 'epoch': 0.14}
+
+ 14%|█▍        | 148/1024 [6:15:22<38:55:23, 159.96s/it][AINFO 12-02 06:28:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:28:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:28:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:28:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 149/1024 [6:18:13<39:42:32, 163.37s/it][A
+                                                        [A{'loss': 0.0625, 'grad_norm': 0.0022962254006415606, 'learning_rate': 1e-05, 'num_tokens': 113308748.0, 'completions/mean_length': 6942.8203125, 'completions/min_length': 200.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6716.232421875, 'completions/min_terminated_length': 200.0, 'completions/max_terminated_length': 14997.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3329663872718811, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01939917355775833, 'sampling/sampling_logp_difference/max': 7.624979496002197, 'sampling/importance_sampling_ratio/min': 0.00048810525913722813, 'sampling/importance_sampling_ratio/mean': 0.9999334812164307, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.949538916349411, 'clip_ratio/low_mean': 3.999794398623635e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.6420379387891444e-06, 'clip_ratio/high_max': 1.4568151755156578e-05, 'clip_ratio/region_mean': 4.3639981413434725e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 149/1024 [6:18:13<39:42:32, 163.37s/it][AINFO 12-02 06:31:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:31:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:31:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:31:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 150/1024 [6:21:04<40:12:20, 165.61s/it][A
+                                                        [A{'loss': 0.0473, 'grad_norm': 0.0020560629200190306, 'learning_rate': 1e-05, 'num_tokens': 114196235.0, 'completions/mean_length': 6783.1796875, 'completions/min_length': 7.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6552.76025390625, 'completions/min_terminated_length': 7.0, 'completions/max_terminated_length': 15182.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019691556692123413, 'sampling/sampling_logp_difference/max': 15.211536407470703, 'sampling/importance_sampling_ratio/min': 2.4757892447269114e-07, 'sampling/importance_sampling_ratio/mean': 0.9998990297317505, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9774708449840546, 'clip_ratio/low_mean': 5.562954720517155e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.196683112742903e-06, 'clip_ratio/high_max': 8.786732450971613e-06, 'clip_ratio/region_mean': 5.7826231113722315e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 150/1024 [6:21:04<40:12:20, 165.61s/it][AINFO 12-02 06:34:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:34:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:34:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:34:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 151/1024 [6:23:28<38:36:13, 159.19s/it][A
+                                                        [A{'loss': 0.0187, 'grad_norm': 0.0022214846685528755, 'learning_rate': 1e-05, 'num_tokens': 114928047.0, 'completions/mean_length': 5568.15625, 'completions/min_length': 271.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5396.4765625, 'completions/min_terminated_length': 271.0, 'completions/max_terminated_length': 15549.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.2585597634315491, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01938418298959732, 'sampling/sampling_logp_difference/max': 10.749968528747559, 'sampling/importance_sampling_ratio/min': 2.1446083337650634e-05, 'sampling/importance_sampling_ratio/mean': 0.9999408721923828, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9303529411554337, 'clip_ratio/low_mean': 2.6389980291696702e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.498708108258143e-06, 'clip_ratio/high_max': 1.799483243303257e-05, 'clip_ratio/region_mean': 3.0888688343111426e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 151/1024 [6:23:28<38:36:13, 159.19s/it][AINFO 12-02 06:37:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:37:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:37:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:37:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 152/1024 [6:25:49<37:16:19, 153.88s/it][A
+                                                        [A{'loss': 0.0599, 'grad_norm': 0.002448044717311859, 'learning_rate': 1e-05, 'num_tokens': 115725657.0, 'completions/mean_length': 6086.578125, 'completions/min_length': 919.0, 'completions/max_length': 15340.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6086.578125, 'completions/min_terminated_length': 919.0, 'completions/max_terminated_length': 15340.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.35878273844718933, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019298439845442772, 'sampling/sampling_logp_difference/max': 3.530261278152466, 'sampling/importance_sampling_ratio/min': 0.02929726243019104, 'sampling/importance_sampling_ratio/mean': 0.9999779462814331, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9131873697042465, 'clip_ratio/low_mean': 5.885063319510664e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9893733426433755e-06, 'clip_ratio/high_max': 1.1957493370573502e-05, 'clip_ratio/region_mean': 6.184000585562899e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 152/1024 [6:25:49<37:16:19, 153.88s/it][AINFO 12-02 06:39:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:39:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:39:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:39:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 153/1024 [6:28:34<38:02:36, 157.24s/it][A
+                                                        [A{'loss': 0.0281, 'grad_norm': 0.002013204852119088, 'learning_rate': 1e-05, 'num_tokens': 116571478.0, 'completions/mean_length': 6442.5390625, 'completions/min_length': 574.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6203.9443359375, 'completions/min_terminated_length': 574.0, 'completions/max_terminated_length': 15987.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.2909066081047058, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01925014518201351, 'sampling/sampling_logp_difference/max': 13.778777122497559, 'sampling/importance_sampling_ratio/min': 1.0374163821325055e-06, 'sampling/importance_sampling_ratio/mean': 1.000044584274292, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8959419652819633, 'clip_ratio/low_mean': 5.717015119444113e-05, 'clip_ratio/low_min': 3.4328400033700746e-06, 'clip_ratio/high_mean': 3.3463394402133417e-06, 'clip_ratio/high_max': 1.3385357760853367e-05, 'clip_ratio/region_mean': 6.0516490520967636e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 153/1024 [6:28:34<38:02:36, 157.24s/it][AINFO 12-02 06:42:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:42:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:42:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:42:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 154/1024 [6:31:26<39:02:47, 161.57s/it][A
+                                                        [A{'loss': 0.056, 'grad_norm': 0.0024523327592760324, 'learning_rate': 1e-05, 'num_tokens': 117440743.0, 'completions/mean_length': 6633.5703125, 'completions/min_length': 358.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6319.0400390625, 'completions/min_terminated_length': 358.0, 'completions/max_terminated_length': 15946.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.30061954259872437, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02061290666460991, 'sampling/sampling_logp_difference/max': 10.413415908813477, 'sampling/importance_sampling_ratio/min': 3.0026931199245155e-05, 'sampling/importance_sampling_ratio/mean': 0.9999213218688965, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0223619118332863, 'clip_ratio/low_mean': 2.9738095065567904e-05, 'clip_ratio/low_min': 3.7240065466903616e-06, 'clip_ratio/high_mean': 3.136903728773177e-06, 'clip_ratio/high_max': 9.34224021875707e-06, 'clip_ratio/region_mean': 3.2874999135401595e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 154/1024 [6:31:26<39:02:47, 161.57s/it][AINFO 12-02 06:44:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:44:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:44:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:44:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 155/1024 [6:34:13<39:23:12, 163.17s/it][A
+                                                        [A{'loss': 0.0395, 'grad_norm': 0.0018688985146582127, 'learning_rate': 1e-05, 'num_tokens': 118380687.0, 'completions/mean_length': 7183.0, 'completions/min_length': 357.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6886.193359375, 'completions/min_terminated_length': 357.0, 'completions/max_terminated_length': 15919.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2498900145292282, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019792160019278526, 'sampling/sampling_logp_difference/max': 11.187394142150879, 'sampling/importance_sampling_ratio/min': 1.3847662557964213e-05, 'sampling/importance_sampling_ratio/mean': 0.9999039173126221, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9815369099378586, 'clip_ratio/low_mean': 3.954866042477079e-05, 'clip_ratio/low_min': 9.874949228105834e-06, 'clip_ratio/high_mean': 3.6343708416097797e-06, 'clip_ratio/high_max': 1.4537483366439119e-05, 'clip_ratio/region_mean': 4.318303126638057e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 155/1024 [6:34:13<39:23:12, 163.17s/it][AINFO 12-02 06:47:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:47:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:47:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:47:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 156/1024 [6:36:56<39:20:39, 163.18s/it][A
+                                                        [A{'loss': 0.0041, 'grad_norm': 0.002005894435569644, 'learning_rate': 1e-05, 'num_tokens': 119207089.0, 'completions/mean_length': 6324.640625, 'completions/min_length': 45.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5829.91748046875, 'completions/min_terminated_length': 45.0, 'completions/max_terminated_length': 16246.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2858940362930298, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01853565312922001, 'sampling/sampling_logp_difference/max': 14.362195014953613, 'sampling/importance_sampling_ratio/min': 5.788659223071591e-07, 'sampling/importance_sampling_ratio/mean': 1.0000035762786865, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.852975606918335, 'clip_ratio/low_mean': 4.9011068711024564e-05, 'clip_ratio/low_min': 1.0991705721608014e-05, 'clip_ratio/high_mean': 1.7914090904014301e-06, 'clip_ratio/high_max': 7.165636361605721e-06, 'clip_ratio/region_mean': 5.0802477687739156e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 156/1024 [6:36:56<39:20:39, 163.18s/it][AINFO 12-02 06:50:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:50:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:50:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:50:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 157/1024 [6:39:37<39:08:03, 162.50s/it][A
+                                                        [A{'loss': 0.0603, 'grad_norm': 0.002577397273853421, 'learning_rate': 1e-05, 'num_tokens': 119961895.0, 'completions/mean_length': 5723.421875, 'completions/min_length': 355.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5290.06494140625, 'completions/min_terminated_length': 355.0, 'completions/max_terminated_length': 16281.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.34321609139442444, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018341556191444397, 'sampling/sampling_logp_difference/max': 2.5405349731445312, 'sampling/importance_sampling_ratio/min': 0.07882421463727951, 'sampling/importance_sampling_ratio/mean': 0.9999703764915466, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8744911625981331, 'clip_ratio/low_mean': 3.834237736555224e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.948848705524142e-06, 'clip_ratio/high_max': 7.795394822096569e-06, 'clip_ratio/region_mean': 4.0291225786859286e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 157/1024 [6:39:37<39:08:03, 162.50s/it][AINFO 12-02 06:53:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:53:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:53:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:53:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 158/1024 [6:42:25<39:29:44, 164.18s/it][A
+                                                        [A{'loss': 0.0273, 'grad_norm': 0.002552987542003393, 'learning_rate': 1e-05, 'num_tokens': 120990289.0, 'completions/mean_length': 7886.015625, 'completions/min_length': 989.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7682.064453125, 'completions/min_terminated_length': 989.0, 'completions/max_terminated_length': 16055.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02049873024225235, 'sampling/sampling_logp_difference/max': 7.013879776000977, 'sampling/importance_sampling_ratio/min': 0.000899312668479979, 'sampling/importance_sampling_ratio/mean': 1.0000030994415283, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9391767829656601, 'clip_ratio/low_mean': 2.636873176697918e-05, 'clip_ratio/low_min': 2.9339967113628518e-06, 'clip_ratio/high_mean': 2.303524297531112e-06, 'clip_ratio/high_max': 9.214097190124448e-06, 'clip_ratio/region_mean': 2.8672255837136618e-05, 'epoch': 0.15}
+
+ 15%|█▌        | 158/1024 [6:42:25<39:29:44, 164.18s/it][AINFO 12-02 06:55:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:55:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:55:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:55:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 159/1024 [6:45:13<39:40:38, 165.13s/it][A
+                                                        [A{'loss': 0.039, 'grad_norm': 0.002678362652659416, 'learning_rate': 1e-05, 'num_tokens': 121797958.0, 'completions/mean_length': 6173.1640625, 'completions/min_length': 445.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6011.087890625, 'completions/min_terminated_length': 445.0, 'completions/max_terminated_length': 16276.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3608373999595642, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018977735191583633, 'sampling/sampling_logp_difference/max': 6.207672119140625, 'sampling/importance_sampling_ratio/min': 0.002013920107856393, 'sampling/importance_sampling_ratio/mean': 0.9999265074729919, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9148785546422005, 'clip_ratio/low_mean': 3.168332909808669e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.72330332160709e-06, 'clip_ratio/high_max': 3.406416203688423e-05, 'clip_ratio/region_mean': 4.140663151019908e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 159/1024 [6:45:13<39:40:38, 165.13s/it][AINFO 12-02 06:58:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:58:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:58:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:58:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 160/1024 [6:48:27<41:45:35, 174.00s/it][A
+                                                        [A{'loss': 0.026, 'grad_norm': 0.0033711253199726343, 'learning_rate': 1e-05, 'num_tokens': 122661170.0, 'completions/mean_length': 6594.21875, 'completions/min_length': 196.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6196.259765625, 'completions/min_terminated_length': 196.0, 'completions/max_terminated_length': 15825.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.30457615852355957, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01938377134501934, 'sampling/sampling_logp_difference/max': 8.122310638427734, 'sampling/importance_sampling_ratio/min': 0.0002968419576063752, 'sampling/importance_sampling_ratio/mean': 0.9998981356620789, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9486038386821747, 'clip_ratio/low_mean': 4.459614581264759e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.6191471483325586e-06, 'clip_ratio/high_max': 1.8476588593330234e-05, 'clip_ratio/region_mean': 4.9215293188353826e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 160/1024 [6:48:27<41:45:35, 174.00s/it][AINFO 12-02 07:02:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:02:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:02:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:02:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 161/1024 [6:51:19<41:30:44, 173.17s/it][A
+                                                        [A{'loss': 0.0804, 'grad_norm': 0.0017857529455795884, 'learning_rate': 1e-05, 'num_tokens': 123518107.0, 'completions/mean_length': 6533.9453125, 'completions/min_length': 524.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6377.595703125, 'completions/min_terminated_length': 524.0, 'completions/max_terminated_length': 15928.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02010391652584076, 'sampling/sampling_logp_difference/max': 11.616875648498535, 'sampling/importance_sampling_ratio/min': 9.012701411847956e-06, 'sampling/importance_sampling_ratio/mean': 0.9998549818992615, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9986584335565567, 'clip_ratio/low_mean': 5.420079878604156e-05, 'clip_ratio/low_min': 4.594068286678521e-06, 'clip_ratio/high_mean': 2.7343705824023345e-06, 'clip_ratio/high_max': 7.97335997049231e-06, 'clip_ratio/region_mean': 5.693517005056492e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 161/1024 [6:51:19<41:30:44, 173.17s/it][AINFO 12-02 07:04:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:04:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:04:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:04:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 162/1024 [6:53:55<40:16:04, 168.17s/it][A
+                                                        [A{'loss': -0.0237, 'grad_norm': 0.0018227624241262674, 'learning_rate': 1e-05, 'num_tokens': 124279031.0, 'completions/mean_length': 5742.21875, 'completions/min_length': 235.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5658.42529296875, 'completions/min_terminated_length': 235.0, 'completions/max_terminated_length': 13212.0, 'rewards/accuracy_reward/mean': 0.21875, 'rewards/accuracy_reward/std': 0.41502299904823303, 'reward': 0.21875, 'reward_std': 0.2869548797607422, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019987668842077255, 'sampling/sampling_logp_difference/max': 6.16689920425415, 'sampling/importance_sampling_ratio/min': 0.0020977305248379707, 'sampling/importance_sampling_ratio/mean': 0.9998506903648376, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0379670709371567, 'clip_ratio/low_mean': 3.5141094485879876e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1176281304869917e-06, 'clip_ratio/high_max': 4.470512521947967e-06, 'clip_ratio/region_mean': 3.625872295742738e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 162/1024 [6:53:55<40:16:04, 168.17s/it][AINFO 12-02 07:07:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:07:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:07:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:07:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 163/1024 [6:56:58<41:15:48, 172.53s/it][A
+                                                        [A{'loss': 0.0837, 'grad_norm': 0.002219022251665592, 'learning_rate': 1e-05, 'num_tokens': 125270761.0, 'completions/mean_length': 7584.703125, 'completions/min_length': 884.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7515.41748046875, 'completions/min_terminated_length': 884.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.37033066153526306, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020291510969400406, 'sampling/sampling_logp_difference/max': 5.997514247894287, 'sampling/importance_sampling_ratio/min': 0.0024849213659763336, 'sampling/importance_sampling_ratio/mean': 0.9999880790710449, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.953459307551384, 'clip_ratio/low_mean': 5.731009014198207e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.21091931709816e-06, 'clip_ratio/high_max': 1.0003542683989508e-05, 'clip_ratio/region_mean': 6.0521009800140746e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 163/1024 [6:56:58<41:15:48, 172.53s/it][AINFO 12-02 07:10:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:10:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:10:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:10:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 164/1024 [6:59:39<40:23:02, 169.05s/it][A
+                                                        [A{'loss': 0.0501, 'grad_norm': 0.0014189074281603098, 'learning_rate': 1e-05, 'num_tokens': 125895279.0, 'completions/mean_length': 4714.671875, 'completions/min_length': 371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4622.78759765625, 'completions/min_terminated_length': 371.0, 'completions/max_terminated_length': 16133.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.28383445739746094, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018739396706223488, 'sampling/sampling_logp_difference/max': 14.727458000183105, 'sampling/importance_sampling_ratio/min': 4.017410901724361e-07, 'sampling/importance_sampling_ratio/mean': 0.9999479651451111, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.018719919025898, 'clip_ratio/low_mean': 3.1135301298945706e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9336673631187296e-06, 'clip_ratio/high_max': 7.734669452474918e-06, 'clip_ratio/region_mean': 3.3068968605221016e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 164/1024 [6:59:39<40:23:02, 169.05s/it][AINFO 12-02 07:13:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:13:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:13:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:13:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 165/1024 [7:02:18<39:40:10, 166.25s/it][A
+                                                        [A{'loss': 0.1007, 'grad_norm': 0.001702460227534175, 'learning_rate': 1e-05, 'num_tokens': 126722881.0, 'completions/mean_length': 6316.140625, 'completions/min_length': 751.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6074.51220703125, 'completions/min_terminated_length': 751.0, 'completions/max_terminated_length': 15913.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01929408684372902, 'sampling/sampling_logp_difference/max': 6.680510997772217, 'sampling/importance_sampling_ratio/min': 0.0012551364488899708, 'sampling/importance_sampling_ratio/mean': 0.9999539852142334, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9325072392821312, 'clip_ratio/low_mean': 3.824179225375701e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5173937956424197e-06, 'clip_ratio/high_max': 1.0069575182569679e-05, 'clip_ratio/region_mean': 4.0759185367278405e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 165/1024 [7:02:18<39:40:10, 166.25s/it][AINFO 12-02 07:15:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:15:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:15:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:15:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 166/1024 [7:04:43<38:03:39, 159.70s/it][A
+                                                        [A{'loss': 0.0078, 'grad_norm': 0.0032931750174611807, 'learning_rate': 1e-05, 'num_tokens': 127341715.0, 'completions/mean_length': 4687.140625, 'completions/min_length': 310.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4595.03955078125, 'completions/min_terminated_length': 310.0, 'completions/max_terminated_length': 15832.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020621225237846375, 'sampling/sampling_logp_difference/max': 6.246901512145996, 'sampling/importance_sampling_ratio/min': 0.0019364450126886368, 'sampling/importance_sampling_ratio/mean': 0.9999821186065674, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0886607319116592, 'clip_ratio/low_mean': 3.119859468370123e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.718260250527237e-06, 'clip_ratio/high_max': 6.873041002108948e-06, 'clip_ratio/region_mean': 3.291685527528898e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 166/1024 [7:04:43<38:03:39, 159.70s/it][AINFO 12-02 07:18:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:18:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:18:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:18:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▋        | 167/1024 [7:07:11<37:12:33, 156.31s/it][A
+                                                        [A{'loss': -0.0414, 'grad_norm': 0.0031696646474301815, 'learning_rate': 1e-05, 'num_tokens': 128093597.0, 'completions/mean_length': 5705.515625, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5449.232421875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 14125.0, 'rewards/accuracy_reward/mean': 0.1953125, 'rewards/accuracy_reward/std': 0.3979988098144531, 'reward': 0.1953125, 'reward_std': 0.21778053045272827, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021961934864521027, 'sampling/sampling_logp_difference/max': 10.350459098815918, 'sampling/importance_sampling_ratio/min': 3.197810656274669e-05, 'sampling/importance_sampling_ratio/mean': 0.9999619126319885, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0523068830370903, 'clip_ratio/low_mean': 4.30743207289197e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.43271312633442e-06, 'clip_ratio/high_max': 1.773085250533768e-05, 'clip_ratio/region_mean': 4.7507033741567284e-05, 'epoch': 0.15}
+
+ 16%|█▋        | 167/1024 [7:07:11<37:12:33, 156.31s/it][AINFO 12-02 07:20:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:20:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:20:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:20:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▋        | 168/1024 [7:09:51<37:25:29, 157.39s/it][A
+                                                        [A{'loss': 0.075, 'grad_norm': 0.0023704832419753075, 'learning_rate': 1e-05, 'num_tokens': 128906948.0, 'completions/mean_length': 6214.4921875, 'completions/min_length': 533.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6053.07177734375, 'completions/min_terminated_length': 533.0, 'completions/max_terminated_length': 15958.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.34139877557754517, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01882763020694256, 'sampling/sampling_logp_difference/max': 7.999940872192383, 'sampling/importance_sampling_ratio/min': 0.0003354824730195105, 'sampling/importance_sampling_ratio/mean': 1.000023365020752, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9371421113610268, 'clip_ratio/low_mean': 5.0530389898995054e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.71476266739046e-06, 'clip_ratio/high_max': 1.885905066956184e-05, 'clip_ratio/region_mean': 5.524515336219338e-05, 'epoch': 0.15}
+
+ 16%|█▋        | 168/1024 [7:09:51<37:25:29, 157.39s/it][AINFO 12-02 07:23:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:23:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:23:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:23:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 169/1024 [7:12:46<38:36:48, 162.58s/it][A
+                                                        [A{'loss': 0.1197, 'grad_norm': 0.003370177699252963, 'learning_rate': 1e-05, 'num_tokens': 129839813.0, 'completions/mean_length': 7127.0703125, 'completions/min_length': 402.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7054.18115234375, 'completions/min_terminated_length': 402.0, 'completions/max_terminated_length': 15865.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3329663574695587, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019736800342798233, 'sampling/sampling_logp_difference/max': 11.43798828125, 'sampling/importance_sampling_ratio/min': 1.077816432371037e-05, 'sampling/importance_sampling_ratio/mean': 0.999907910823822, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9854387491941452, 'clip_ratio/low_mean': 4.5897569179942366e-05, 'clip_ratio/low_min': 8.727477506909054e-06, 'clip_ratio/high_mean': 7.60518054221393e-06, 'clip_ratio/high_max': 3.042072216885572e-05, 'clip_ratio/region_mean': 5.3502750233747065e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 169/1024 [7:12:46<38:36:48, 162.58s/it][AINFO 12-02 07:26:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:26:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:26:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:26:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 170/1024 [7:15:34<38:59:17, 164.35s/it][A
+                                                        [A{'loss': 0.0661, 'grad_norm': 0.0026055986527353525, 'learning_rate': 1e-05, 'num_tokens': 130698370.0, 'completions/mean_length': 6566.2890625, 'completions/min_length': 969.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6330.6640625, 'completions/min_terminated_length': 969.0, 'completions/max_terminated_length': 15865.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.36295419931411743, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01787097379565239, 'sampling/sampling_logp_difference/max': 8.074028015136719, 'sampling/importance_sampling_ratio/min': 0.00031152591691352427, 'sampling/importance_sampling_ratio/mean': 0.9999133944511414, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7978609576821327, 'clip_ratio/low_mean': 3.2797592325550795e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.243764005375851e-06, 'clip_ratio/high_max': 2.1401074718596647e-05, 'clip_ratio/region_mean': 3.904135610355297e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 170/1024 [7:15:34<38:59:17, 164.35s/it][AINFO 12-02 07:29:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:29:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:29:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:29:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 171/1024 [7:18:23<39:16:45, 165.77s/it][A
+                                                        [A{'loss': 0.0683, 'grad_norm': 0.0023866184055805206, 'learning_rate': 1e-05, 'num_tokens': 131637439.0, 'completions/mean_length': 7186.2890625, 'completions/min_length': 351.0, 'completions/max_length': 15576.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7186.2890625, 'completions/min_terminated_length': 351.0, 'completions/max_terminated_length': 15576.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.2059282809495926, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02137116715312004, 'sampling/sampling_logp_difference/max': 7.211773872375488, 'sampling/importance_sampling_ratio/min': 0.0007378471200354397, 'sampling/importance_sampling_ratio/mean': 0.9999207258224487, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0232757329940796, 'clip_ratio/low_mean': 2.0564424403346493e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.0564424403346493e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 171/1024 [7:18:23<39:16:45, 165.77s/it][AINFO 12-02 07:31:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:31:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:31:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:31:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 172/1024 [7:20:57<38:21:05, 162.05s/it][A
+                                                        [A{'loss': 0.0699, 'grad_norm': 0.003491115989163518, 'learning_rate': 1e-05, 'num_tokens': 132371816.0, 'completions/mean_length': 5591.5703125, 'completions/min_length': 635.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5420.26220703125, 'completions/min_terminated_length': 635.0, 'completions/max_terminated_length': 13638.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3406373858451843, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018760837614536285, 'sampling/sampling_logp_difference/max': 8.998766899108887, 'sampling/importance_sampling_ratio/min': 0.00012356207298580557, 'sampling/importance_sampling_ratio/mean': 0.999891459941864, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9335208311676979, 'clip_ratio/low_mean': 5.8380828136250784e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0094751814904157e-05, 'clip_ratio/high_max': 4.037900725961663e-05, 'clip_ratio/region_mean': 6.847557995115494e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 172/1024 [7:20:57<38:21:05, 162.05s/it][AINFO 12-02 07:34:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:34:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:34:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:34:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 173/1024 [7:23:41<38:29:16, 162.82s/it][A
+                                                        [A{'loss': 0.0526, 'grad_norm': 0.002830669516697526, 'learning_rate': 1e-05, 'num_tokens': 133307297.0, 'completions/mean_length': 7152.3828125, 'completions/min_length': 281.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6930.82421875, 'completions/min_terminated_length': 281.0, 'completions/max_terminated_length': 16302.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.28801077604293823, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021548541262745857, 'sampling/sampling_logp_difference/max': 8.179040908813477, 'sampling/importance_sampling_ratio/min': 0.00028047082014381886, 'sampling/importance_sampling_ratio/mean': 0.9999501705169678, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1329835206270218, 'clip_ratio/low_mean': 4.4085751369493664e-05, 'clip_ratio/low_min': 6.7955093072669115e-06, 'clip_ratio/high_mean': 7.094694183251704e-07, 'clip_ratio/high_max': 2.8378776733006816e-06, 'clip_ratio/region_mean': 4.4795220674132e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 173/1024 [7:23:41<38:29:16, 162.82s/it][AINFO 12-02 07:37:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:37:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:37:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:37:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 174/1024 [7:26:24<38:27:26, 162.88s/it][A
+                                                        [A{'loss': 0.0541, 'grad_norm': 0.0027821618132293224, 'learning_rate': 1e-05, 'num_tokens': 134260107.0, 'completions/mean_length': 7263.453125, 'completions/min_length': 352.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7118.68310546875, 'completions/min_terminated_length': 352.0, 'completions/max_terminated_length': 15068.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.2858940362930298, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022098438814282417, 'sampling/sampling_logp_difference/max': 9.454667091369629, 'sampling/importance_sampling_ratio/min': 7.832317351130769e-05, 'sampling/importance_sampling_ratio/mean': 0.999946117401123, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.092760555446148, 'clip_ratio/low_mean': 3.4009618616437365e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.537707359806518e-06, 'clip_ratio/high_max': 1.0150829439226072e-05, 'clip_ratio/region_mean': 3.654732597624388e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 174/1024 [7:26:24<38:27:26, 162.88s/it][AINFO 12-02 07:39:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:39:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:39:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:39:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 175/1024 [7:29:20<39:20:46, 166.84s/it][A
+                                                        [A{'loss': -0.0197, 'grad_norm': 0.0007076738984324038, 'learning_rate': 1e-05, 'num_tokens': 135186139.0, 'completions/mean_length': 7088.8125, 'completions/min_length': 1314.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6710.958984375, 'completions/min_terminated_length': 1314.0, 'completions/max_terminated_length': 15388.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.20593319833278656, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020075790584087372, 'sampling/sampling_logp_difference/max': 10.386486053466797, 'sampling/importance_sampling_ratio/min': 3.084653872065246e-05, 'sampling/importance_sampling_ratio/mean': 0.9998199343681335, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0669445469975471, 'clip_ratio/low_mean': 1.6864279416495265e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6404690061099245e-06, 'clip_ratio/high_max': 1.0561876024439698e-05, 'clip_ratio/region_mean': 1.9504748649978865e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 175/1024 [7:29:20<39:20:46, 166.84s/it][AINFO 12-02 07:42:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:42:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:42:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:42:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 176/1024 [7:31:33<36:51:16, 156.46s/it][A
+                                                        [A{'loss': 0.0261, 'grad_norm': 0.0022445612121373415, 'learning_rate': 1e-05, 'num_tokens': 135888929.0, 'completions/mean_length': 5352.734375, 'completions/min_length': 333.0, 'completions/max_length': 15626.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5352.734375, 'completions/min_terminated_length': 333.0, 'completions/max_terminated_length': 15626.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.399257630109787, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.02010166086256504, 'sampling/sampling_logp_difference/max': 8.029678344726562, 'sampling/importance_sampling_ratio/min': 0.00032565294532105327, 'sampling/importance_sampling_ratio/mean': 0.9999054670333862, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0387161895632744, 'clip_ratio/low_mean': 2.7592465016823553e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7737540929374518e-06, 'clip_ratio/high_max': 7.095016371749807e-06, 'clip_ratio/region_mean': 2.936621888238733e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 176/1024 [7:31:33<36:51:16, 156.46s/it][AINFO 12-02 07:45:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:45:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:45:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:45:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 177/1024 [7:34:29<38:14:38, 162.55s/it][A
+                                                        [A{'loss': 0.0621, 'grad_norm': 0.0022021254990249872, 'learning_rate': 1e-05, 'num_tokens': 136901941.0, 'completions/mean_length': 7758.90625, 'completions/min_length': 742.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7408.29248046875, 'completions/min_terminated_length': 742.0, 'completions/max_terminated_length': 16266.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2914257347583771, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021426808089017868, 'sampling/sampling_logp_difference/max': 15.30886173248291, 'sampling/importance_sampling_ratio/min': 2.2461865967216e-07, 'sampling/importance_sampling_ratio/mean': 0.9999858140945435, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0648984238505363, 'clip_ratio/low_mean': 2.7625993425317574e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.426987970873597e-06, 'clip_ratio/high_max': 1.5100852124305675e-05, 'clip_ratio/region_mean': 3.2052981168817496e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 177/1024 [7:34:29<38:14:38, 162.55s/it][AINFO 12-02 07:48:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:48:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:48:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:48:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 178/1024 [7:37:31<39:30:38, 168.13s/it][A
+                                                        [A{'loss': 0.035, 'grad_norm': 0.0013461806811392307, 'learning_rate': 1e-05, 'num_tokens': 137824623.0, 'completions/mean_length': 7036.953125, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6496.21484375, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16277.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.2546031177043915, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020304443314671516, 'sampling/sampling_logp_difference/max': 9.74915885925293, 'sampling/importance_sampling_ratio/min': 5.834372132085264e-05, 'sampling/importance_sampling_ratio/mean': 0.999944806098938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9684997871518135, 'clip_ratio/low_mean': 2.2069365058996482e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.4063813144675805e-06, 'clip_ratio/high_max': 2.5346608254039893e-05, 'clip_ratio/region_mean': 2.9475746259777225e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 178/1024 [7:37:31<39:30:38, 168.13s/it][AINFO 12-02 07:51:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:51:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:51:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:51:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 179/1024 [7:40:10<38:50:09, 165.45s/it][A
+                                                        [A{'loss': 0.0943, 'grad_norm': 0.002132438588887453, 'learning_rate': 1e-05, 'num_tokens': 138625247.0, 'completions/mean_length': 6114.1875, 'completions/min_length': 223.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5951.1748046875, 'completions/min_terminated_length': 223.0, 'completions/max_terminated_length': 15500.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.321650892496109, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020084267482161522, 'sampling/sampling_logp_difference/max': 6.361074447631836, 'sampling/importance_sampling_ratio/min': 0.0017275095451623201, 'sampling/importance_sampling_ratio/mean': 0.9999298453330994, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.943072073161602, 'clip_ratio/low_mean': 4.841489999307669e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2869335200302885e-06, 'clip_ratio/high_max': 1.3147734080121154e-05, 'clip_ratio/region_mean': 5.170183294467279e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 179/1024 [7:40:10<38:50:09, 165.45s/it][AINFO 12-02 07:53:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 180/1024 [7:43:06<39:30:39, 168.53s/it][A
+                                                        [A{'loss': 0.0666, 'grad_norm': 0.0026741649489849806, 'learning_rate': 1e-05, 'num_tokens': 139619287.0, 'completions/mean_length': 7618.875, 'completions/min_length': 1030.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7034.53369140625, 'completions/min_terminated_length': 1030.0, 'completions/max_terminated_length': 15050.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2927239239215851, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020061582326889038, 'sampling/sampling_logp_difference/max': 5.124405860900879, 'sampling/importance_sampling_ratio/min': 0.005949751473963261, 'sampling/importance_sampling_ratio/mean': 0.9998897314071655, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9142575263977051, 'clip_ratio/low_mean': 3.5252990301160025e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.468289489523158e-06, 'clip_ratio/high_max': 1.7873157958092634e-05, 'clip_ratio/region_mean': 3.9721279790683184e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 180/1024 [7:43:06<39:30:39, 168.53s/it][AINFO 12-02 07:56:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:56:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:56:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:56:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 181/1024 [7:45:55<39:33:05, 168.90s/it][A
+                                                        [A{'loss': 0.0544, 'grad_norm': 0.0027016003150492907, 'learning_rate': 1e-05, 'num_tokens': 140318935.0, 'completions/mean_length': 5333.875, 'completions/min_length': 1109.0, 'completions/max_length': 16106.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5333.875, 'completions/min_terminated_length': 1109.0, 'completions/max_terminated_length': 16106.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2556639611721039, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017069874331355095, 'sampling/sampling_logp_difference/max': 4.982499122619629, 'sampling/importance_sampling_ratio/min': 0.006856904830783606, 'sampling/importance_sampling_ratio/mean': 1.000013828277588, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8107482865452766, 'clip_ratio/low_mean': 4.5301517502593924e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6280379188392544e-06, 'clip_ratio/high_max': 1.0512151675357018e-05, 'clip_ratio/region_mean': 4.792955542143318e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 181/1024 [7:45:55<39:33:05, 168.90s/it][AINFO 12-02 07:59:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:59:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:59:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:59:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 182/1024 [7:48:51<39:58:51, 170.94s/it][A
+                                                        [A{'loss': 0.0466, 'grad_norm': 0.002655779244378209, 'learning_rate': 1e-05, 'num_tokens': 141293534.0, 'completions/mean_length': 7443.3046875, 'completions/min_length': 19.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7154.89501953125, 'completions/min_terminated_length': 19.0, 'completions/max_terminated_length': 16050.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.2398776262998581, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02006504125893116, 'sampling/sampling_logp_difference/max': 7.774807453155518, 'sampling/importance_sampling_ratio/min': 0.00042018835665658116, 'sampling/importance_sampling_ratio/mean': 0.9999659061431885, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9224414080381393, 'clip_ratio/low_mean': 5.6120721524166584e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.24943533264377e-06, 'clip_ratio/high_max': 1.85085939392593e-05, 'clip_ratio/region_mean': 6.137015702734061e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 182/1024 [7:48:51<39:58:51, 170.94s/it][AINFO 12-02 08:02:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:02:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:02:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:02:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 183/1024 [7:51:07<37:28:37, 160.43s/it][A
+                                                        [A{'loss': 0.0288, 'grad_norm': 0.0016281780553981662, 'learning_rate': 1e-05, 'num_tokens': 142037438.0, 'completions/mean_length': 5646.6875, 'completions/min_length': 342.0, 'completions/max_length': 15923.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5646.6875, 'completions/min_terminated_length': 342.0, 'completions/max_terminated_length': 15923.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.17912296950817108, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.019336247816681862, 'sampling/sampling_logp_difference/max': 7.46689510345459, 'sampling/importance_sampling_ratio/min': 0.0005717006279155612, 'sampling/importance_sampling_ratio/mean': 1.000030517578125, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8945339694619179, 'clip_ratio/low_mean': 2.2748562741981004e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7355736139943474e-06, 'clip_ratio/high_max': 1.494229445597739e-05, 'clip_ratio/region_mean': 2.6484136355975352e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 183/1024 [7:51:07<37:28:37, 160.43s/it][AINFO 12-02 08:04:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:04:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:04:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:04:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 184/1024 [7:53:55<37:59:34, 162.83s/it][A
+                                                        [A{'loss': 0.0011, 'grad_norm': 0.002166559686884284, 'learning_rate': 1e-05, 'num_tokens': 142873848.0, 'completions/mean_length': 6384.640625, 'completions/min_length': 130.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5892.86865234375, 'completions/min_terminated_length': 130.0, 'completions/max_terminated_length': 14142.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.35506346821784973, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018109092488884926, 'sampling/sampling_logp_difference/max': 12.249908447265625, 'sampling/importance_sampling_ratio/min': 4.785555574926548e-06, 'sampling/importance_sampling_ratio/mean': 1.0000462532043457, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.840093269944191, 'clip_ratio/low_mean': 3.5050728683927446e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.33997717109014e-06, 'clip_ratio/high_max': 3.335990868436056e-05, 'clip_ratio/region_mean': 4.339070608239126e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 184/1024 [7:53:55<37:59:34, 162.83s/it][AINFO 12-02 08:07:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:07:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:07:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:07:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 185/1024 [7:56:35<37:44:08, 161.92s/it][A
+                                                        [A{'loss': -0.0086, 'grad_norm': 0.0021932912059128284, 'learning_rate': 1e-05, 'num_tokens': 143636152.0, 'completions/mean_length': 5775.0, 'completions/min_length': 1147.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5691.46435546875, 'completions/min_terminated_length': 1147.0, 'completions/max_terminated_length': 14828.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.32325342297554016, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019278086721897125, 'sampling/sampling_logp_difference/max': 18.44968605041504, 'sampling/importance_sampling_ratio/min': 9.714113069492214e-09, 'sampling/importance_sampling_ratio/mean': 1.000008225440979, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8915362879633904, 'clip_ratio/low_mean': 4.0552770769863855e-05, 'clip_ratio/low_min': 7.133888630050933e-06, 'clip_ratio/high_mean': 3.852763711620355e-06, 'clip_ratio/high_max': 1.541105484648142e-05, 'clip_ratio/region_mean': 4.440553459517105e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 185/1024 [7:56:35<37:44:08, 161.92s/it][AINFO 12-02 08:10:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:10:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:10:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:10:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 186/1024 [7:59:15<37:32:53, 161.31s/it][A
+                                                        [A{'loss': 0.0242, 'grad_norm': 0.0022947140969336033, 'learning_rate': 1e-05, 'num_tokens': 144447370.0, 'completions/mean_length': 6181.640625, 'completions/min_length': 473.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6019.69873046875, 'completions/min_terminated_length': 473.0, 'completions/max_terminated_length': 15122.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.2022808939218521, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02050788700580597, 'sampling/sampling_logp_difference/max': 16.416534423828125, 'sampling/importance_sampling_ratio/min': 7.419757253046555e-08, 'sampling/importance_sampling_ratio/mean': 0.9999147653579712, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0544511675834656, 'clip_ratio/low_mean': 2.7509142171311396e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.7509142171311396e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 186/1024 [7:59:15<37:32:53, 161.31s/it][AINFO 12-02 08:12:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:12:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:12:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:12:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 187/1024 [8:01:57<37:34:26, 161.61s/it][A
+                                                        [A{'loss': 0.0977, 'grad_norm': 0.0034910975955426693, 'learning_rate': 1e-05, 'num_tokens': 145303505.0, 'completions/mean_length': 6542.3046875, 'completions/min_length': 628.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6306.1044921875, 'completions/min_terminated_length': 628.0, 'completions/max_terminated_length': 15824.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.30433881282806396, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020022759214043617, 'sampling/sampling_logp_difference/max': 4.931766986846924, 'sampling/importance_sampling_ratio/min': 0.007213745731860399, 'sampling/importance_sampling_ratio/mean': 0.9999945163726807, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.933225467801094, 'clip_ratio/low_mean': 2.4595847037289786e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.9252499846043065e-06, 'clip_ratio/high_max': 1.5700999938417226e-05, 'clip_ratio/region_mean': 2.8521096965050674e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 187/1024 [8:01:57<37:34:26, 161.61s/it][AINFO 12-02 08:15:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:15:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:15:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:15:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 188/1024 [8:04:49<38:15:20, 164.74s/it][A
+                                                        [A{'loss': 0.0391, 'grad_norm': 0.0024078311398625374, 'learning_rate': 1e-05, 'num_tokens': 146082198.0, 'completions/mean_length': 5889.4765625, 'completions/min_length': 99.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5637.6083984375, 'completions/min_terminated_length': 99.0, 'completions/max_terminated_length': 15365.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02060198038816452, 'sampling/sampling_logp_difference/max': 7.04927921295166, 'sampling/importance_sampling_ratio/min': 0.0008680344326421618, 'sampling/importance_sampling_ratio/mean': 0.9999341368675232, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9649673849344254, 'clip_ratio/low_mean': 2.61421698724007e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5249853504428756e-06, 'clip_ratio/high_max': 6.0999414017715026e-06, 'clip_ratio/region_mean': 2.7667155109156738e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 188/1024 [8:04:49<38:15:20, 164.74s/it][AINFO 12-02 08:18:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:18:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:18:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:18:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 189/1024 [8:07:32<38:02:46, 164.03s/it][A
+                                                        [A{'loss': 0.1275, 'grad_norm': 0.0021191861014813185, 'learning_rate': 1e-05, 'num_tokens': 146786245.0, 'completions/mean_length': 5349.2421875, 'completions/min_length': 678.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5084.408203125, 'completions/min_terminated_length': 678.0, 'completions/max_terminated_length': 16233.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017112664878368378, 'sampling/sampling_logp_difference/max': 10.187494277954102, 'sampling/importance_sampling_ratio/min': 3.763807762879878e-05, 'sampling/importance_sampling_ratio/mean': 0.9999837875366211, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8402756005525589, 'clip_ratio/low_mean': 3.6395756637830345e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9474045984679833e-06, 'clip_ratio/high_max': 7.789618393871933e-06, 'clip_ratio/region_mean': 3.834316100892465e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 189/1024 [8:07:32<38:02:46, 164.03s/it][AINFO 12-02 08:21:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:21:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:21:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:21:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▊        | 190/1024 [8:10:35<39:19:39, 169.76s/it][A
+                                                        [A{'loss': 0.0079, 'grad_norm': 0.0022120666690170765, 'learning_rate': 1e-05, 'num_tokens': 147737086.0, 'completions/mean_length': 7272.3203125, 'completions/min_length': 1074.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7053.64013671875, 'completions/min_terminated_length': 1074.0, 'completions/max_terminated_length': 15786.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.27304792404174805, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0203307643532753, 'sampling/sampling_logp_difference/max': 10.984610557556152, 'sampling/importance_sampling_ratio/min': 1.6960719221970066e-05, 'sampling/importance_sampling_ratio/mean': 0.9999538660049438, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9627499282360077, 'clip_ratio/low_mean': 5.095924211673264e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.115443348633562e-06, 'clip_ratio/high_max': 1.2461773394534248e-05, 'clip_ratio/region_mean': 5.4074685294835945e-05, 'epoch': 0.17}
+
+ 19%|█▊        | 190/1024 [8:10:35<39:19:39, 169.76s/it][AINFO 12-02 08:24:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:24:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:24:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:24:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▊        | 191/1024 [8:13:32<39:45:54, 171.85s/it][A
+                                                        [A{'loss': 0.1091, 'grad_norm': 0.0030851473566144705, 'learning_rate': 1e-05, 'num_tokens': 148573782.0, 'completions/mean_length': 6387.1875, 'completions/min_length': 1310.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5895.54052734375, 'completions/min_terminated_length': 1310.0, 'completions/max_terminated_length': 16212.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018049638718366623, 'sampling/sampling_logp_difference/max': 5.531248569488525, 'sampling/importance_sampling_ratio/min': 0.003961040172725916, 'sampling/importance_sampling_ratio/mean': 0.99997878074646, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9110158830881119, 'clip_ratio/low_mean': 5.616715043288423e-05, 'clip_ratio/low_min': 7.80031223257538e-06, 'clip_ratio/high_mean': 4.472931891541521e-06, 'clip_ratio/high_max': 1.7891727566166082e-05, 'clip_ratio/region_mean': 6.064008221073891e-05, 'epoch': 0.18}
+
+ 19%|█▊        | 191/1024 [8:13:32<39:45:54, 171.85s/it][AINFO 12-02 08:27:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:27:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:27:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:27:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 192/1024 [8:16:27<39:55:57, 172.79s/it][A
+                                                        [A{'loss': 0.0462, 'grad_norm': 0.0020694085396826267, 'learning_rate': 1e-05, 'num_tokens': 149521258.0, 'completions/mean_length': 7267.59375, 'completions/min_length': 653.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7195.81103515625, 'completions/min_terminated_length': 653.0, 'completions/max_terminated_length': 16176.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.29719972610473633, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01898832805454731, 'sampling/sampling_logp_difference/max': 11.812461853027344, 'sampling/importance_sampling_ratio/min': 7.411616934405174e-06, 'sampling/importance_sampling_ratio/mean': 0.9999054670333862, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9254888147115707, 'clip_ratio/low_mean': 3.274822392995702e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.400205964178895e-06, 'clip_ratio/high_max': 1.6994396901282016e-05, 'clip_ratio/region_mean': 3.8148429439388565e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 192/1024 [8:16:27<39:55:57, 172.79s/it][AINFO 12-02 08:30:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:30:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:30:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:30:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▉        | 193/1024 [8:19:14<39:29:40, 171.10s/it][A
+                                                        [A{'loss': 0.0108, 'grad_norm': 0.003085972974076867, 'learning_rate': 1e-05, 'num_tokens': 150447923.0, 'completions/mean_length': 7100.1953125, 'completions/min_length': 560.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6952.83349609375, 'completions/min_terminated_length': 560.0, 'completions/max_terminated_length': 16076.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.23645778000354767, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01974140852689743, 'sampling/sampling_logp_difference/max': 6.749999046325684, 'sampling/importance_sampling_ratio/min': 0.0011708807433024049, 'sampling/importance_sampling_ratio/mean': 0.9999178647994995, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8455610796809196, 'clip_ratio/low_mean': 4.7441100377909606e-05, 'clip_ratio/low_min': 4.552241534838686e-06, 'clip_ratio/high_mean': 1.026036670737085e-06, 'clip_ratio/high_max': 4.10414668294834e-06, 'clip_ratio/region_mean': 4.8467136821273016e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 193/1024 [8:19:14<39:29:40, 171.10s/it][AINFO 12-02 08:32:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:32:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:32:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:32:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 194/1024 [8:21:41<37:49:09, 164.04s/it][A
+                                                        [A{'loss': 0.0885, 'grad_norm': 0.003505800850689411, 'learning_rate': 1e-05, 'num_tokens': 151313834.0, 'completions/mean_length': 6626.4296875, 'completions/min_length': 1746.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6549.5986328125, 'completions/min_terminated_length': 1746.0, 'completions/max_terminated_length': 15535.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.17176413536071777, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021082937717437744, 'sampling/sampling_logp_difference/max': 10.479642868041992, 'sampling/importance_sampling_ratio/min': 2.8102756914449856e-05, 'sampling/importance_sampling_ratio/mean': 0.9999381303787231, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0323699787259102, 'clip_ratio/low_mean': 1.6514521121280268e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.6514521121280268e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 194/1024 [8:21:41<37:49:09, 164.04s/it][AINFO 12-02 08:35:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:35:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:35:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:35:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 195/1024 [8:24:48<39:20:35, 170.85s/it][A
+                                                        [A{'loss': 0.0308, 'grad_norm': 0.002495395252481103, 'learning_rate': 1e-05, 'num_tokens': 152238192.0, 'completions/mean_length': 7093.109375, 'completions/min_length': 69.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6870.12841796875, 'completions/min_terminated_length': 69.0, 'completions/max_terminated_length': 15595.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.31800350546836853, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020610272884368896, 'sampling/sampling_logp_difference/max': 9.257795333862305, 'sampling/importance_sampling_ratio/min': 9.536534344078973e-05, 'sampling/importance_sampling_ratio/mean': 0.9999728798866272, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0206764563918114, 'clip_ratio/low_mean': 3.503898199141986e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.890002398700744e-06, 'clip_ratio/high_max': 2.006086378969485e-05, 'clip_ratio/region_mean': 4.092898473118112e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 195/1024 [8:24:48<39:20:35, 170.85s/it][AINFO 12-02 08:38:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:38:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:38:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:38:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 196/1024 [8:27:36<39:07:26, 170.10s/it][A
+                                                        [A{'loss': -0.0031, 'grad_norm': 0.0014066790463402867, 'learning_rate': 1e-05, 'num_tokens': 153131828.0, 'completions/mean_length': 6795.71875, 'completions/min_length': 424.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6486.4189453125, 'completions/min_terminated_length': 424.0, 'completions/max_terminated_length': 14191.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01874586008489132, 'sampling/sampling_logp_difference/max': 12.187495231628418, 'sampling/importance_sampling_ratio/min': 5.093755135021638e-06, 'sampling/importance_sampling_ratio/mean': 0.9998855590820312, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8927837759256363, 'clip_ratio/low_mean': 4.056704699451075e-05, 'clip_ratio/low_min': 1.1648833606159315e-05, 'clip_ratio/high_mean': 8.088102276815334e-07, 'clip_ratio/high_max': 3.2352409107261337e-06, 'clip_ratio/region_mean': 4.1375856994818605e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 196/1024 [8:27:36<39:07:26, 170.10s/it][AINFO 12-02 08:41:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:41:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:41:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:41:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 197/1024 [8:30:41<40:05:08, 174.50s/it][A
+                                                        [A{'loss': 0.0359, 'grad_norm': 0.0017937121447175741, 'learning_rate': 1e-05, 'num_tokens': 154057097.0, 'completions/mean_length': 7075.1015625, 'completions/min_length': 813.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6617.28662109375, 'completions/min_terminated_length': 813.0, 'completions/max_terminated_length': 15831.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.23068872094154358, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01890135183930397, 'sampling/sampling_logp_difference/max': 8.437480926513672, 'sampling/importance_sampling_ratio/min': 0.00021659507183358073, 'sampling/importance_sampling_ratio/mean': 0.9998950958251953, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8989318311214447, 'clip_ratio/low_mean': 3.655197178886738e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.8111575122456998e-06, 'clip_ratio/high_max': 1.5244630048982799e-05, 'clip_ratio/region_mean': 4.03631290737394e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 197/1024 [8:30:41<40:05:08, 174.50s/it][AINFO 12-02 08:44:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:44:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:44:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:44:14 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 08:46:01,677 - math_verify.grader - WARNING - Timeout during comparison
+[OpenTinker] 2025-12-02 08:46:06,689 - math_verify.grader - WARNING - Timeout during comparison
+
+ 19%|█▉        | 198/1024 [8:33:38<40:12:53, 175.27s/it][A
+                                                        [A{'loss': 0.0536, 'grad_norm': 0.003410576842725277, 'learning_rate': 1e-05, 'num_tokens': 154988585.0, 'completions/mean_length': 7120.0, 'completions/min_length': 78.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6743.41455078125, 'completions/min_terminated_length': 78.0, 'completions/max_terminated_length': 14100.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018400676548480988, 'sampling/sampling_logp_difference/max': 5.629853248596191, 'sampling/importance_sampling_ratio/min': 0.003589102067053318, 'sampling/importance_sampling_ratio/mean': 0.9999953508377075, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8758384585380554, 'clip_ratio/low_mean': 3.2670792506905855e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.977033995601232e-06, 'clip_ratio/high_max': 1.4074375030759256e-05, 'clip_ratio/region_mean': 3.764782627513341e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 198/1024 [8:33:38<40:12:53, 175.27s/it][AINFO 12-02 08:47:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:47:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:47:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:47:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 199/1024 [8:36:18<39:06:44, 170.67s/it][A
+                                                        [A{'loss': 0.0585, 'grad_norm': 0.003304310142993927, 'learning_rate': 1e-05, 'num_tokens': 155851000.0, 'completions/mean_length': 6590.6796875, 'completions/min_length': 23.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6513.56689453125, 'completions/min_terminated_length': 23.0, 'completions/max_terminated_length': 15821.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01959652081131935, 'sampling/sampling_logp_difference/max': 13.576997756958008, 'sampling/importance_sampling_ratio/min': 1.2693599273916334e-06, 'sampling/importance_sampling_ratio/mean': 0.9999579787254333, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9243742749094963, 'clip_ratio/low_mean': 1.977112736994968e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.977112736994968e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 199/1024 [8:36:18<39:06:44, 170.67s/it][AINFO 12-02 08:49:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:49:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:49:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:49:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 200/1024 [8:39:16<39:33:20, 172.82s/it][A
+                                                        [A{'loss': 0.1142, 'grad_norm': 0.0023830258287489414, 'learning_rate': 1e-05, 'num_tokens': 156766782.0, 'completions/mean_length': 6999.484375, 'completions/min_length': 445.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6696.7578125, 'completions/min_terminated_length': 445.0, 'completions/max_terminated_length': 16115.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.2885475754737854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018434934318065643, 'sampling/sampling_logp_difference/max': 8.820915222167969, 'sampling/importance_sampling_ratio/min': 0.00014761318743694574, 'sampling/importance_sampling_ratio/mean': 0.9998635053634644, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.843244343996048, 'clip_ratio/low_mean': 4.7742656533955596e-05, 'clip_ratio/low_min': 8.646529749967158e-06, 'clip_ratio/high_mean': 2.858841753550223e-06, 'clip_ratio/high_max': 1.1435367014200892e-05, 'clip_ratio/region_mean': 5.0601498060132144e-05, 'epoch': 0.18}
+
+ 20%|█▉        | 200/1024 [8:39:16<39:33:20, 172.82s/it][AINFO 12-02 08:52:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:52:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:52:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:52:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 201/1024 [8:41:50<38:14:27, 167.28s/it][A
+                                                        [A{'loss': 0.0795, 'grad_norm': 0.0024427250027656555, 'learning_rate': 1e-05, 'num_tokens': 157606126.0, 'completions/mean_length': 6407.5, 'completions/min_length': 351.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6249.14306640625, 'completions/min_terminated_length': 351.0, 'completions/max_terminated_length': 14986.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.32879000902175903, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0192743968218565, 'sampling/sampling_logp_difference/max': 8.37498950958252, 'sampling/importance_sampling_ratio/min': 0.0002305622911080718, 'sampling/importance_sampling_ratio/mean': 0.999966025352478, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9549195989966393, 'clip_ratio/low_mean': 5.3607667723554187e-05, 'clip_ratio/low_min': 9.219345429301029e-06, 'clip_ratio/high_mean': 7.040741365926806e-06, 'clip_ratio/high_max': 2.5114631171163637e-05, 'clip_ratio/region_mean': 6.064840863473364e-05, 'epoch': 0.18}
+
+ 20%|█▉        | 201/1024 [8:41:50<38:14:27, 167.28s/it][AINFO 12-02 08:55:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:55:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:55:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:55:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 202/1024 [8:44:44<38:37:25, 169.16s/it][A
+                                                        [A{'loss': 0.0604, 'grad_norm': 0.002024515997618437, 'learning_rate': 1e-05, 'num_tokens': 158474248.0, 'completions/mean_length': 6638.390625, 'completions/min_length': 56.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 5901.328125, 'completions/min_terminated_length': 56.0, 'completions/max_terminated_length': 15519.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.28117600083351135, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01955476775765419, 'sampling/sampling_logp_difference/max': 5.624914169311523, 'sampling/importance_sampling_ratio/min': 0.0036068728659301996, 'sampling/importance_sampling_ratio/mean': 0.9999830722808838, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9070822075009346, 'clip_ratio/low_mean': 2.928529067958152e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.928529067958152e-05, 'epoch': 0.19}
+
+ 20%|█▉        | 202/1024 [8:44:44<38:37:25, 169.16s/it][AINFO 12-02 08:58:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:58:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:58:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:58:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 203/1024 [8:47:38<38:54:31, 170.61s/it][A
+                                                        [A{'loss': 0.0616, 'grad_norm': 0.0022787705529481173, 'learning_rate': 1e-05, 'num_tokens': 159434350.0, 'completions/mean_length': 7324.546875, 'completions/min_length': 571.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6878.99951171875, 'completions/min_terminated_length': 571.0, 'completions/max_terminated_length': 15708.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.26515230536460876, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019122496247291565, 'sampling/sampling_logp_difference/max': 3.231938362121582, 'sampling/importance_sampling_ratio/min': 0.03948089852929115, 'sampling/importance_sampling_ratio/mean': 0.9999351501464844, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9108889549970627, 'clip_ratio/low_mean': 4.1470637825113954e-05, 'clip_ratio/low_min': 4.027710474474588e-06, 'clip_ratio/high_mean': 2.091293367811886e-06, 'clip_ratio/high_max': 8.365173471247545e-06, 'clip_ratio/region_mean': 4.356193130661268e-05, 'epoch': 0.19}
+
+ 20%|█▉        | 203/1024 [8:47:38<38:54:31, 170.61s/it][AINFO 12-02 09:01:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:01:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:01:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:01:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 204/1024 [8:50:21<38:19:49, 168.28s/it][A
+                                                        [A{'loss': 0.049, 'grad_norm': 0.0026195270475000143, 'learning_rate': 1e-05, 'num_tokens': 160163055.0, 'completions/mean_length': 5520.4453125, 'completions/min_length': 73.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5434.9052734375, 'completions/min_terminated_length': 73.0, 'completions/max_terminated_length': 13983.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.24831004440784454, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019064132124185562, 'sampling/sampling_logp_difference/max': 7.498111724853516, 'sampling/importance_sampling_ratio/min': 0.0005541297141462564, 'sampling/importance_sampling_ratio/mean': 0.9998810291290283, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8982062339782715, 'clip_ratio/low_mean': 3.456336048657249e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1643335230692173e-06, 'clip_ratio/high_max': 8.65733409227687e-06, 'clip_ratio/region_mean': 3.672769389595487e-05, 'epoch': 0.19}
+
+ 20%|█▉        | 204/1024 [8:50:21<38:19:49, 168.28s/it][AINFO 12-02 09:03:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:03:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:03:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:03:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 205/1024 [8:52:44<36:33:36, 160.70s/it][A
+                                                        [A{'loss': 0.006, 'grad_norm': 0.0014831912703812122, 'learning_rate': 1e-05, 'num_tokens': 161057657.0, 'completions/mean_length': 6846.515625, 'completions/min_length': 63.0, 'completions/max_length': 15267.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6846.515625, 'completions/min_terminated_length': 63.0, 'completions/max_terminated_length': 15267.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.27198708057403564, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020372584462165833, 'sampling/sampling_logp_difference/max': 9.679890632629395, 'sampling/importance_sampling_ratio/min': 6.252834282349795e-05, 'sampling/importance_sampling_ratio/mean': 0.9999252557754517, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9657742157578468, 'clip_ratio/low_mean': 4.059042771586974e-05, 'clip_ratio/low_min': 5.350111223378917e-06, 'clip_ratio/high_mean': 6.650576210631698e-06, 'clip_ratio/high_max': 1.8376186289970065e-05, 'clip_ratio/region_mean': 4.724100449493562e-05, 'epoch': 0.19}
+
+ 20%|██        | 205/1024 [8:52:44<36:33:36, 160.70s/it][AINFO 12-02 09:06:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:06:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:06:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:06:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 206/1024 [8:55:40<37:34:48, 165.39s/it][A
+                                                        [A{'loss': 0.0696, 'grad_norm': 0.0017962189158424735, 'learning_rate': 1e-05, 'num_tokens': 161966356.0, 'completions/mean_length': 6946.8984375, 'completions/min_length': 1133.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6642.4755859375, 'completions/min_terminated_length': 1133.0, 'completions/max_terminated_length': 16370.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.33114415407180786, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019146796315908432, 'sampling/sampling_logp_difference/max': 9.561946868896484, 'sampling/importance_sampling_ratio/min': 7.035569433355704e-05, 'sampling/importance_sampling_ratio/mean': 0.9999545216560364, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8490508273243904, 'clip_ratio/low_mean': 4.473214539757464e-05, 'clip_ratio/low_min': 2.9674999950657366e-06, 'clip_ratio/high_mean': 4.14725354858092e-06, 'clip_ratio/high_max': 1.658901419432368e-05, 'clip_ratio/region_mean': 4.887939894615556e-05, 'epoch': 0.19}
+
+ 20%|██        | 206/1024 [8:55:40<37:34:48, 165.39s/it][AINFO 12-02 09:09:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:09:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:09:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:09:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 207/1024 [8:58:16<36:53:08, 162.53s/it][A
+                                                        [A{'loss': 0.0509, 'grad_norm': 0.0017619321588426828, 'learning_rate': 1e-05, 'num_tokens': 162836705.0, 'completions/mean_length': 6618.9765625, 'completions/min_length': 529.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6463.9765625, 'completions/min_terminated_length': 529.0, 'completions/max_terminated_length': 15123.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2130674123764038, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020236656069755554, 'sampling/sampling_logp_difference/max': 14.680485725402832, 'sampling/importance_sampling_ratio/min': 4.2106199771296815e-07, 'sampling/importance_sampling_ratio/mean': 0.9999436140060425, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9541772454977036, 'clip_ratio/low_mean': 3.22491199540309e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.22491199540309e-05, 'epoch': 0.19}
+
+ 20%|██        | 207/1024 [8:58:16<36:53:08, 162.53s/it][AINFO 12-02 09:11:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:11:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:11:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:11:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 208/1024 [9:00:47<36:02:12, 158.99s/it][A
+                                                        [A{'loss': 0.0709, 'grad_norm': 0.002670915797352791, 'learning_rate': 1e-05, 'num_tokens': 163558197.0, 'completions/mean_length': 5485.71875, 'completions/min_length': 104.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5312.73046875, 'completions/min_terminated_length': 104.0, 'completions/max_terminated_length': 12469.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3145885467529297, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019581373780965805, 'sampling/sampling_logp_difference/max': 7.592487812042236, 'sampling/importance_sampling_ratio/min': 0.0005042250850237906, 'sampling/importance_sampling_ratio/mean': 1.0000442266464233, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8888534903526306, 'clip_ratio/low_mean': 3.877300162002939e-05, 'clip_ratio/low_min': 4.230834292684449e-06, 'clip_ratio/high_mean': 4.211513555674173e-06, 'clip_ratio/high_max': 1.6846054222696694e-05, 'clip_ratio/region_mean': 4.298451551676408e-05, 'epoch': 0.19}
+
+ 20%|██        | 208/1024 [9:00:47<36:02:12, 158.99s/it][AINFO 12-02 09:14:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:14:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:14:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:14:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 209/1024 [9:03:08<34:49:34, 153.83s/it][A
+                                                        [A{'loss': 0.1072, 'grad_norm': 0.004005427472293377, 'learning_rate': 1e-05, 'num_tokens': 164133499.0, 'completions/mean_length': 4345.171875, 'completions/min_length': 68.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4250.3779296875, 'completions/min_terminated_length': 68.0, 'completions/max_terminated_length': 16184.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.31642353534698486, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017508968710899353, 'sampling/sampling_logp_difference/max': 3.773045301437378, 'sampling/importance_sampling_ratio/min': 0.022981969639658928, 'sampling/importance_sampling_ratio/mean': 0.9999247193336487, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8308270424604416, 'clip_ratio/low_mean': 3.6889288480779214e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.6889288480779214e-05, 'epoch': 0.19}
+
+ 20%|██        | 209/1024 [9:03:08<34:49:34, 153.83s/it][AINFO 12-02 09:16:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:16:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:16:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:16:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 210/1024 [9:05:29<33:53:25, 149.88s/it][A
+                                                        [A{'loss': 0.0179, 'grad_norm': 0.0031033784616738558, 'learning_rate': 1e-05, 'num_tokens': 164823681.0, 'completions/mean_length': 5227.296875, 'completions/min_length': 5.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5050.20654296875, 'completions/min_terminated_length': 5.0, 'completions/max_terminated_length': 15509.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.29249146580696106, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019171088933944702, 'sampling/sampling_logp_difference/max': 6.149660587310791, 'sampling/importance_sampling_ratio/min': 0.0021342060063034296, 'sampling/importance_sampling_ratio/mean': 0.999896764755249, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9231975972652435, 'clip_ratio/low_mean': 2.723402121773688e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2492790751348366e-06, 'clip_ratio/high_max': 1.2997116300539346e-05, 'clip_ratio/region_mean': 3.0483300406558556e-05, 'epoch': 0.19}
+
+ 21%|██        | 210/1024 [9:05:29<33:53:25, 149.88s/it][AINFO 12-02 09:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:19:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 211/1024 [9:08:09<34:29:43, 152.75s/it][A
+                                                        [A{'loss': 0.0785, 'grad_norm': 0.003495733719319105, 'learning_rate': 1e-05, 'num_tokens': 165668798.0, 'completions/mean_length': 6473.4765625, 'completions/min_length': 726.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6316.1669921875, 'completions/min_terminated_length': 726.0, 'completions/max_terminated_length': 14160.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3469353914260864, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018978482112288475, 'sampling/sampling_logp_difference/max': 7.663229465484619, 'sampling/importance_sampling_ratio/min': 0.0004697878030128777, 'sampling/importance_sampling_ratio/mean': 1.0000354051589966, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9061874598264694, 'clip_ratio/low_mean': 2.9314877565411734e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.208972652326338e-06, 'clip_ratio/high_max': 2.0835890609305352e-05, 'clip_ratio/region_mean': 3.452385044511175e-05, 'epoch': 0.19}
+
+ 21%|██        | 211/1024 [9:08:09<34:29:43, 152.75s/it][AINFO 12-02 09:21:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:21:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:21:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:21:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 212/1024 [9:11:08<36:17:39, 160.91s/it][A
+                                                        [A{'loss': 0.027, 'grad_norm': 0.002381941769272089, 'learning_rate': 1e-05, 'num_tokens': 166603375.0, 'completions/mean_length': 7140.1953125, 'completions/min_length': 141.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6605.4296875, 'completions/min_terminated_length': 141.0, 'completions/max_terminated_length': 14659.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.27776598930358887, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021141134202480316, 'sampling/sampling_logp_difference/max': 7.748849868774414, 'sampling/importance_sampling_ratio/min': 0.00043123820796608925, 'sampling/importance_sampling_ratio/mean': 0.999864935874939, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9605444446206093, 'clip_ratio/low_mean': 3.991967162164656e-05, 'clip_ratio/low_min': 6.304534053924726e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.991967162164656e-05, 'epoch': 0.2}
+
+ 21%|██        | 212/1024 [9:11:08<36:17:39, 160.91s/it][AINFO 12-02 09:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:24:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 213/1024 [9:13:36<35:19:12, 156.78s/it][A
+                                                        [A{'loss': 0.0534, 'grad_norm': 0.0040566748939454556, 'learning_rate': 1e-05, 'num_tokens': 167302275.0, 'completions/mean_length': 5304.46875, 'completions/min_length': 342.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5038.56005859375, 'completions/min_terminated_length': 342.0, 'completions/max_terminated_length': 13770.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.33114415407180786, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018822530284523964, 'sampling/sampling_logp_difference/max': 14.508296012878418, 'sampling/importance_sampling_ratio/min': 5.001809313398553e-07, 'sampling/importance_sampling_ratio/mean': 0.9999827742576599, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9176690131425858, 'clip_ratio/low_mean': 3.4953729482367635e-05, 'clip_ratio/low_min': 3.991060111729894e-06, 'clip_ratio/high_mean': 3.7371441976574715e-06, 'clip_ratio/high_max': 1.4948576790629886e-05, 'clip_ratio/region_mean': 3.869087413477246e-05, 'epoch': 0.2}
+
+ 21%|██        | 213/1024 [9:13:36<35:19:12, 156.78s/it][AINFO 12-02 09:27:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:27:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:27:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:27:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 214/1024 [9:16:05<34:47:08, 154.60s/it][A
+                                                        [A{'loss': 0.0737, 'grad_norm': 0.0021502040326595306, 'learning_rate': 1e-05, 'num_tokens': 168063627.0, 'completions/mean_length': 5796.5, 'completions/min_length': 407.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5542.400390625, 'completions/min_terminated_length': 407.0, 'completions/max_terminated_length': 15791.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01985779032111168, 'sampling/sampling_logp_difference/max': 4.655994892120361, 'sampling/importance_sampling_ratio/min': 0.009504453279078007, 'sampling/importance_sampling_ratio/mean': 0.9999223351478577, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9230027198791504, 'clip_ratio/low_mean': 2.653866999935417e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.653866999935417e-05, 'epoch': 0.2}
+
+ 21%|██        | 214/1024 [9:16:05<34:47:08, 154.60s/it][AINFO 12-02 09:29:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:29:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:29:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:29:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 215/1024 [9:18:34<34:22:57, 153.00s/it][A
+                                                        [A{'loss': 0.0139, 'grad_norm': 0.0027786416467279196, 'learning_rate': 1e-05, 'num_tokens': 168867858.0, 'completions/mean_length': 6135.4921875, 'completions/min_length': 1259.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6054.79541015625, 'completions/min_terminated_length': 1259.0, 'completions/max_terminated_length': 14814.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.3366856575012207, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018514130264520645, 'sampling/sampling_logp_difference/max': 10.553963661193848, 'sampling/importance_sampling_ratio/min': 2.6089865059475414e-05, 'sampling/importance_sampling_ratio/mean': 0.9999550580978394, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.869445689022541, 'clip_ratio/low_mean': 2.4175752741939505e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7159603632753715e-06, 'clip_ratio/high_max': 1.0863841453101486e-05, 'clip_ratio/region_mean': 2.6891713218901714e-05, 'epoch': 0.2}
+
+ 21%|██        | 215/1024 [9:18:34<34:22:57, 153.00s/it][AINFO 12-02 09:32:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:32:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:32:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:32:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 216/1024 [9:21:44<36:49:23, 164.06s/it][A
+                                                        [A{'loss': 0.0188, 'grad_norm': 0.0007328780484385788, 'learning_rate': 1e-05, 'num_tokens': 169689969.0, 'completions/mean_length': 6268.2421875, 'completions/min_length': 627.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6025.46435546875, 'completions/min_terminated_length': 627.0, 'completions/max_terminated_length': 15691.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.10994865000247955, 'frac_reward_zero_std': 0.75, 'sampling/sampling_logp_difference/mean': 0.02005261555314064, 'sampling/sampling_logp_difference/max': 11.003040313720703, 'sampling/importance_sampling_ratio/min': 1.6650999896228313e-05, 'sampling/importance_sampling_ratio/mean': 1.0000133514404297, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.951081782579422, 'clip_ratio/low_mean': 1.993327998661698e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0919700343947625e-06, 'clip_ratio/high_max': 4.36788013757905e-06, 'clip_ratio/region_mean': 3.0852980330564606e-06, 'epoch': 0.2}
+
+ 21%|██        | 216/1024 [9:21:44<36:49:23, 164.06s/it][AINFO 12-02 09:35:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:35:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:35:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:35:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 217/1024 [9:24:15<35:52:46, 160.06s/it][A
+                                                        [A{'loss': -0.0245, 'grad_norm': 0.0034721922129392624, 'learning_rate': 1e-05, 'num_tokens': 170409292.0, 'completions/mean_length': 5440.8984375, 'completions/min_length': 413.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5354.732421875, 'completions/min_terminated_length': 413.0, 'completions/max_terminated_length': 13861.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.30327308177948, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019136395305395126, 'sampling/sampling_logp_difference/max': 10.904656410217285, 'sampling/importance_sampling_ratio/min': 1.8372484191786498e-05, 'sampling/importance_sampling_ratio/mean': 0.9998912811279297, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8271932750940323, 'clip_ratio/low_mean': 3.5254403428552905e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.3283407055459975e-06, 'clip_ratio/high_max': 2.131336282218399e-05, 'clip_ratio/region_mean': 4.058274430462916e-05, 'epoch': 0.2}
+
+ 21%|██        | 217/1024 [9:24:15<35:52:46, 160.06s/it][AINFO 12-02 09:37:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:37:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:37:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:37:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██▏       | 218/1024 [9:27:02<36:18:01, 162.14s/it][A
+                                                        [A{'loss': 0.0286, 'grad_norm': 0.0027575206477195024, 'learning_rate': 1e-05, 'num_tokens': 171280714.0, 'completions/mean_length': 6671.046875, 'completions/min_length': 748.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6594.56689453125, 'completions/min_terminated_length': 748.0, 'completions/max_terminated_length': 15086.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2109457552433014, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019089506939053535, 'sampling/sampling_logp_difference/max': 11.06179428100586, 'sampling/importance_sampling_ratio/min': 1.5700872609158978e-05, 'sampling/importance_sampling_ratio/mean': 0.9999411702156067, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9659745842218399, 'clip_ratio/low_mean': 2.156280152121326e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.13687178024702e-06, 'clip_ratio/high_max': 1.2339016848272877e-05, 'clip_ratio/region_mean': 2.569967330146028e-05, 'epoch': 0.2}
+
+ 21%|██▏       | 218/1024 [9:27:02<36:18:01, 162.14s/it][AINFO 12-02 09:40:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:40:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:40:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:40:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██▏       | 219/1024 [9:30:02<37:28:23, 167.58s/it][A
+                                                        [A{'loss': 0.0115, 'grad_norm': 0.0012711051385849714, 'learning_rate': 1e-05, 'num_tokens': 172302489.0, 'completions/mean_length': 7781.5546875, 'completions/min_length': 429.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7504.05615234375, 'completions/min_terminated_length': 429.0, 'completions/max_terminated_length': 15752.0, 'rewards/accuracy_reward/mean': 0.109375, 'rewards/accuracy_reward/std': 0.31333550810813904, 'reward': 0.109375, 'reward_std': 0.1751839816570282, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.023309212177991867, 'sampling/sampling_logp_difference/max': 5.281247615814209, 'sampling/importance_sampling_ratio/min': 0.005086081102490425, 'sampling/importance_sampling_ratio/mean': 0.9998820424079895, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1691131889820099, 'clip_ratio/low_mean': 3.2977761520669446e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.650864528026432e-06, 'clip_ratio/high_max': 1.4603458112105727e-05, 'clip_ratio/region_mean': 3.662862599185246e-05, 'epoch': 0.2}
+
+ 21%|██▏       | 219/1024 [9:30:02<37:28:23, 167.58s/it][AINFO 12-02 09:43:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:43:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:43:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:43:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██▏       | 220/1024 [9:32:49<37:20:16, 167.18s/it][A
+                                                        [A{'loss': 0.0156, 'grad_norm': 0.0016933141741901636, 'learning_rate': 1e-05, 'num_tokens': 173149653.0, 'completions/mean_length': 6489.96875, 'completions/min_length': 556.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6332.9208984375, 'completions/min_terminated_length': 556.0, 'completions/max_terminated_length': 14891.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.32325342297554016, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019165027886629105, 'sampling/sampling_logp_difference/max': 4.7106852531433105, 'sampling/importance_sampling_ratio/min': 0.008998609147965908, 'sampling/importance_sampling_ratio/mean': 0.9999572038650513, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9354017227888107, 'clip_ratio/low_mean': 4.5269940528669395e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.710521871700621e-06, 'clip_ratio/high_max': 6.842087486802484e-06, 'clip_ratio/region_mean': 4.6980462457213434e-05, 'epoch': 0.2}
+
+ 21%|██▏       | 220/1024 [9:32:49<37:20:16, 167.18s/it][AINFO 12-02 09:46:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:46:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:46:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:46:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 221/1024 [9:35:16<35:56:34, 161.14s/it][A
+                                                        [A{'loss': 0.0552, 'grad_norm': 0.0024940327275544405, 'learning_rate': 1e-05, 'num_tokens': 173976797.0, 'completions/mean_length': 6309.75, 'completions/min_length': 474.0, 'completions/max_length': 14943.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6309.75, 'completions/min_terminated_length': 474.0, 'completions/max_terminated_length': 14943.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2790592610836029, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020797956734895706, 'sampling/sampling_logp_difference/max': 6.280703544616699, 'sampling/importance_sampling_ratio/min': 0.0018720829393714666, 'sampling/importance_sampling_ratio/mean': 0.9999861121177673, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.012483686208725, 'clip_ratio/low_mean': 3.0998270403870265e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.111851803325408e-06, 'clip_ratio/high_max': 2.444740721330163e-05, 'clip_ratio/region_mean': 3.711012095664046e-05, 'epoch': 0.2}
+
+ 22%|██▏       | 221/1024 [9:35:16<35:56:34, 161.14s/it][AINFO 12-02 09:48:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:48:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:48:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:48:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 222/1024 [9:38:14<37:04:48, 166.44s/it][A
+                                                        [A{'loss': 0.0588, 'grad_norm': 0.002748408354818821, 'learning_rate': 1e-05, 'num_tokens': 174965259.0, 'completions/mean_length': 7574.984375, 'completions/min_length': 598.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7363.568359375, 'completions/min_terminated_length': 598.0, 'completions/max_terminated_length': 16362.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.25224411487579346, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019229793921113014, 'sampling/sampling_logp_difference/max': 5.170575141906738, 'sampling/importance_sampling_ratio/min': 0.005681300535798073, 'sampling/importance_sampling_ratio/mean': 1.0000108480453491, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9144782647490501, 'clip_ratio/low_mean': 1.7024583712554886e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5388877677178243e-06, 'clip_ratio/high_max': 1.1112337460872368e-05, 'clip_ratio/region_mean': 2.056347148027271e-05, 'epoch': 0.2}
+
+ 22%|██▏       | 222/1024 [9:38:14<37:04:48, 166.44s/it][AINFO 12-02 09:51:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:51:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:51:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:51:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 223/1024 [9:41:09<37:35:24, 168.94s/it][A
+                                                        [A{'loss': 0.0714, 'grad_norm': 0.0020343128126114607, 'learning_rate': 1e-05, 'num_tokens': 175876446.0, 'completions/mean_length': 6962.7734375, 'completions/min_length': 780.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6499.43408203125, 'completions/min_terminated_length': 780.0, 'completions/max_terminated_length': 15365.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3156445026397705, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01860032044351101, 'sampling/sampling_logp_difference/max': 8.734436988830566, 'sampling/importance_sampling_ratio/min': 0.0001609467581147328, 'sampling/importance_sampling_ratio/mean': 0.9999679327011108, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9248140156269073, 'clip_ratio/low_mean': 3.722507381098694e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.736522501185391e-06, 'clip_ratio/high_max': 1.4946090004741563e-05, 'clip_ratio/region_mean': 4.096159636901575e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 223/1024 [9:41:09<37:35:24, 168.94s/it][AINFO 12-02 09:54:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:54:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:54:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:54:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 224/1024 [9:43:42<36:27:58, 164.10s/it][A
+                                                        [A{'loss': 0.1116, 'grad_norm': 0.0029119597747921944, 'learning_rate': 1e-05, 'num_tokens': 176717226.0, 'completions/mean_length': 6422.28125, 'completions/min_length': 117.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6264.1591796875, 'completions/min_terminated_length': 117.0, 'completions/max_terminated_length': 14978.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01786171644926071, 'sampling/sampling_logp_difference/max': 7.371761798858643, 'sampling/importance_sampling_ratio/min': 0.0006287595024332404, 'sampling/importance_sampling_ratio/mean': 0.9999918937683105, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7786787301301956, 'clip_ratio/low_mean': 3.189400638348161e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0565285037955618e-06, 'clip_ratio/high_max': 4.226114015182247e-06, 'clip_ratio/region_mean': 3.295053488727717e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 224/1024 [9:43:42<36:27:58, 164.10s/it][AINFO 12-02 09:57:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:57:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:57:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:57:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 225/1024 [9:46:35<37:00:10, 166.72s/it][A
+                                                        [A{'loss': 0.0204, 'grad_norm': 0.0027519147843122482, 'learning_rate': 1e-05, 'num_tokens': 177586766.0, 'completions/mean_length': 6662.65625, 'completions/min_length': 486.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6508.349609375, 'completions/min_terminated_length': 486.0, 'completions/max_terminated_length': 16030.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.21382881700992584, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020679686218500137, 'sampling/sampling_logp_difference/max': 10.593820571899414, 'sampling/importance_sampling_ratio/min': 2.507045428501442e-05, 'sampling/importance_sampling_ratio/mean': 1.0000051259994507, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9501350447535515, 'clip_ratio/low_mean': 2.103693077515345e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3528137969842646e-06, 'clip_ratio/high_max': 5.4112551879370585e-06, 'clip_ratio/region_mean': 2.2389744572137715e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 225/1024 [9:46:35<37:00:10, 166.72s/it][AINFO 12-02 10:00:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:00:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:00:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:00:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 226/1024 [9:49:17<36:40:38, 165.46s/it][A
+                                                        [A{'loss': 0.0748, 'grad_norm': 0.0017936143558472395, 'learning_rate': 1e-05, 'num_tokens': 178444556.0, 'completions/mean_length': 6546.171875, 'completions/min_length': 839.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6146.259765625, 'completions/min_terminated_length': 839.0, 'completions/max_terminated_length': 15419.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2909066081047058, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019764548167586327, 'sampling/sampling_logp_difference/max': 9.379026412963867, 'sampling/importance_sampling_ratio/min': 8.447741129202768e-05, 'sampling/importance_sampling_ratio/mean': 1.0000327825546265, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9217342138290405, 'clip_ratio/low_mean': 5.783435085504607e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.121946279970871e-07, 'clip_ratio/high_max': 3.2487785119883483e-06, 'clip_ratio/region_mean': 5.8646545539886574e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 226/1024 [9:49:17<36:40:38, 165.46s/it][AINFO 12-02 10:02:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:02:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:02:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:02:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 227/1024 [9:51:50<35:47:14, 161.65s/it][A
+                                                        [A{'loss': 0.0505, 'grad_norm': 0.0031324021983891726, 'learning_rate': 1e-05, 'num_tokens': 179288499.0, 'completions/mean_length': 6433.9296875, 'completions/min_length': 731.0, 'completions/max_length': 15707.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6433.9296875, 'completions/min_terminated_length': 731.0, 'completions/max_terminated_length': 15707.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019691072404384613, 'sampling/sampling_logp_difference/max': 8.595767974853516, 'sampling/importance_sampling_ratio/min': 0.00018488657951820642, 'sampling/importance_sampling_ratio/mean': 0.9999454021453857, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9361409991979599, 'clip_ratio/low_mean': 4.3977801396977156e-05, 'clip_ratio/low_min': 7.912247156127705e-06, 'clip_ratio/high_mean': 5.4951232755229285e-06, 'clip_ratio/high_max': 2.1980493102091714e-05, 'clip_ratio/region_mean': 4.947292427459615e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 227/1024 [9:51:50<35:47:14, 161.65s/it][AINFO 12-02 10:05:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:05:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:05:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:05:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 228/1024 [9:54:23<35:09:33, 159.01s/it][A
+                                                        [A{'loss': 0.0044, 'grad_norm': 0.0024714914616197348, 'learning_rate': 1e-05, 'num_tokens': 180079619.0, 'completions/mean_length': 6037.75, 'completions/min_length': 551.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5873.52392578125, 'completions/min_terminated_length': 551.0, 'completions/max_terminated_length': 15787.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.21436560153961182, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018519341945648193, 'sampling/sampling_logp_difference/max': 9.374723434448242, 'sampling/importance_sampling_ratio/min': 8.4841696661897e-05, 'sampling/importance_sampling_ratio/mean': 0.9999628067016602, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8700985535979271, 'clip_ratio/low_mean': 3.756406420052372e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2485420433658874e-06, 'clip_ratio/high_max': 1.299416817346355e-05, 'clip_ratio/region_mean': 4.081260635757644e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 228/1024 [9:54:23<35:09:33, 159.01s/it][AINFO 12-02 10:07:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:07:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:07:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:07:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 229/1024 [9:56:19<32:17:40, 146.24s/it][A
+                                                        [A{'loss': 0.0478, 'grad_norm': 0.003593914210796356, 'learning_rate': 1e-05, 'num_tokens': 180780877.0, 'completions/mean_length': 5305.828125, 'completions/min_length': 229.0, 'completions/max_length': 12264.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5305.828125, 'completions/min_terminated_length': 229.0, 'completions/max_terminated_length': 12264.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.24671241641044617, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020471621304750443, 'sampling/sampling_logp_difference/max': 4.611040115356445, 'sampling/importance_sampling_ratio/min': 0.009941472671926022, 'sampling/importance_sampling_ratio/mean': 1.000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1309608668088913, 'clip_ratio/low_mean': 2.2305866423266707e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8233268974654493e-06, 'clip_ratio/high_max': 7.293307589861797e-06, 'clip_ratio/region_mean': 2.412919320704532e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 229/1024 [9:56:19<32:17:40, 146.24s/it][AINFO 12-02 10:09:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:09:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:09:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:09:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 230/1024 [9:58:55<32:53:30, 149.13s/it][A
+                                                        [A{'loss': 0.0913, 'grad_norm': 0.0029816587921231985, 'learning_rate': 1e-05, 'num_tokens': 181571465.0, 'completions/mean_length': 6022.96875, 'completions/min_length': 556.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5774.30419921875, 'completions/min_terminated_length': 556.0, 'completions/max_terminated_length': 13814.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.41504397988319397, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.0181986466050148, 'sampling/sampling_logp_difference/max': 11.04552936553955, 'sampling/importance_sampling_ratio/min': 1.5958334188326262e-05, 'sampling/importance_sampling_ratio/mean': 0.9999518394470215, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8560900762677193, 'clip_ratio/low_mean': 4.4980357415624894e-05, 'clip_ratio/low_min': 1.0012816346716136e-05, 'clip_ratio/high_mean': 5.040853750415408e-06, 'clip_ratio/high_max': 2.0163415001661633e-05, 'clip_ratio/region_mean': 5.0021211109196884e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 230/1024 [9:58:55<32:53:30, 149.13s/it][AINFO 12-02 10:12:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:12:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:12:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:12:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 231/1024 [10:01:29<33:09:45, 150.55s/it][A
+                                                         [A{'loss': 0.1317, 'grad_norm': 0.0031632622703909874, 'learning_rate': 1e-05, 'num_tokens': 182440957.0, 'completions/mean_length': 6647.71875, 'completions/min_length': 196.0, 'completions/max_length': 15130.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6647.71875, 'completions/min_terminated_length': 196.0, 'completions/max_terminated_length': 15130.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.39902517199516296, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01906408555805683, 'sampling/sampling_logp_difference/max': 11.124979019165039, 'sampling/importance_sampling_ratio/min': 1.4739508515049238e-05, 'sampling/importance_sampling_ratio/mean': 1.0000306367874146, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9455481320619583, 'clip_ratio/low_mean': 4.450247388376738e-05, 'clip_ratio/low_min': 4.840271230932558e-06, 'clip_ratio/high_mean': 6.177042905619601e-06, 'clip_ratio/high_max': 1.8430865566188004e-05, 'clip_ratio/region_mean': 5.067951724413433e-05, 'epoch': 0.21}
+
+ 23%|██▎       | 231/1024 [10:01:29<33:09:45, 150.55s/it][AINFO 12-02 10:15:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:15:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:15:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:15:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 232/1024 [10:04:40<35:48:52, 162.79s/it][A
+                                                         [A{'loss': 0.0346, 'grad_norm': 0.00162694591563195, 'learning_rate': 1e-05, 'num_tokens': 183332242.0, 'completions/mean_length': 6809.1640625, 'completions/min_length': 471.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6500.29833984375, 'completions/min_terminated_length': 471.0, 'completions/max_terminated_length': 15988.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.33616161346435547, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021866722032427788, 'sampling/sampling_logp_difference/max': 12.369775772094727, 'sampling/importance_sampling_ratio/min': 4.244970114086755e-06, 'sampling/importance_sampling_ratio/mean': 1.0000290870666504, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.050546184182167, 'clip_ratio/low_mean': 6.042617155799235e-05, 'clip_ratio/low_min': 1.1000354334100848e-05, 'clip_ratio/high_mean': 5.7342632544532535e-06, 'clip_ratio/high_max': 2.2937053017813014e-05, 'clip_ratio/region_mean': 6.616043401663774e-05, 'epoch': 0.21}
+
+ 23%|██▎       | 232/1024 [10:04:40<35:48:52, 162.79s/it][AINFO 12-02 10:18:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:18:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:18:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:18:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 233/1024 [10:07:26<35:57:14, 163.63s/it][A
+                                                         [A{'loss': 0.1059, 'grad_norm': 0.0024887355975806713, 'learning_rate': 1e-05, 'num_tokens': 184225138.0, 'completions/mean_length': 6815.5, 'completions/min_length': 51.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6506.83837890625, 'completions/min_terminated_length': 51.0, 'completions/max_terminated_length': 16348.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2869548499584198, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02019432932138443, 'sampling/sampling_logp_difference/max': 8.953315734863281, 'sampling/importance_sampling_ratio/min': 0.00012930770753882825, 'sampling/importance_sampling_ratio/mean': 0.9999393820762634, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.060033954679966, 'clip_ratio/low_mean': 2.4373607971028832e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.669602961053897e-06, 'clip_ratio/high_max': 1.4678411844215589e-05, 'clip_ratio/region_mean': 2.8043211159456405e-05, 'epoch': 0.21}
+
+ 23%|██▎       | 233/1024 [10:07:26<35:57:14, 163.63s/it][AINFO 12-02 10:20:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:20:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:20:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:20:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 234/1024 [10:10:08<35:48:41, 163.19s/it][A
+                                                         [A{'loss': 0.0624, 'grad_norm': 0.002783838426694274, 'learning_rate': 1e-05, 'num_tokens': 185136323.0, 'completions/mean_length': 6928.4453125, 'completions/min_length': 304.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6623.42724609375, 'completions/min_terminated_length': 304.0, 'completions/max_terminated_length': 15838.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.25460803508758545, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01905050128698349, 'sampling/sampling_logp_difference/max': 11.498395919799805, 'sampling/importance_sampling_ratio/min': 1.0146355634788051e-05, 'sampling/importance_sampling_ratio/mean': 0.9999524354934692, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9051575735211372, 'clip_ratio/low_mean': 3.8802519611635944e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9777228317252593e-06, 'clip_ratio/high_max': 7.910891326901037e-06, 'clip_ratio/region_mean': 4.078024221598753e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 234/1024 [10:10:08<35:48:41, 163.19s/it][AINFO 12-02 10:23:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:23:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:23:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:23:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 235/1024 [10:12:41<35:06:02, 160.16s/it][A
+                                                         [A{'loss': 0.0655, 'grad_norm': 0.002105508930981159, 'learning_rate': 1e-05, 'num_tokens': 186071324.0, 'completions/mean_length': 7155.1328125, 'completions/min_length': 111.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7082.46435546875, 'completions/min_terminated_length': 111.0, 'completions/max_terminated_length': 14703.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.26303553581237793, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020860780030488968, 'sampling/sampling_logp_difference/max': 5.656649112701416, 'sampling/importance_sampling_ratio/min': 0.003494206117466092, 'sampling/importance_sampling_ratio/mean': 0.999904990196228, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0119014978408813, 'clip_ratio/low_mean': 1.733424267058581e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0998486459357082e-06, 'clip_ratio/high_max': 4.399394583742833e-06, 'clip_ratio/region_mean': 1.8434091430208355e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 235/1024 [10:12:41<35:06:02, 160.16s/it][AINFO 12-02 10:26:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:26:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:26:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:26:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 236/1024 [10:15:38<36:08:01, 165.08s/it][A
+                                                         [A{'loss': 0.048, 'grad_norm': 0.0013582308311015368, 'learning_rate': 1e-05, 'num_tokens': 187045035.0, 'completions/mean_length': 7463.2421875, 'completions/min_length': 698.0, 'completions/max_length': 16176.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7463.2421875, 'completions/min_terminated_length': 698.0, 'completions/max_terminated_length': 16176.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2517249584197998, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021131811663508415, 'sampling/sampling_logp_difference/max': 7.65624475479126, 'sampling/importance_sampling_ratio/min': 0.000473080639494583, 'sampling/importance_sampling_ratio/mean': 0.9999428987503052, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9983502700924873, 'clip_ratio/low_mean': 2.8499469067355676e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.4390433256703545e-06, 'clip_ratio/high_max': 1.0561529961705673e-05, 'clip_ratio/region_mean': 3.193851205196552e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 236/1024 [10:15:38<36:08:01, 165.08s/it][AINFO 12-02 10:29:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:29:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:29:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:29:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 237/1024 [10:18:09<35:08:26, 160.75s/it][A
+                                                         [A{'loss': 0.0987, 'grad_norm': 0.002681629965081811, 'learning_rate': 1e-05, 'num_tokens': 187889609.0, 'completions/mean_length': 6460.984375, 'completions/min_length': 1747.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6382.8505859375, 'completions/min_terminated_length': 1747.0, 'completions/max_terminated_length': 14938.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.39082521200180054, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.016937749460339546, 'sampling/sampling_logp_difference/max': 6.499812602996826, 'sampling/importance_sampling_ratio/min': 0.0015037209959700704, 'sampling/importance_sampling_ratio/mean': 0.9999568462371826, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7869217246770859, 'clip_ratio/low_mean': 3.985050443588989e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.127253367234516e-06, 'clip_ratio/high_max': 8.509013468938065e-06, 'clip_ratio/region_mean': 4.197775751890731e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 237/1024 [10:18:09<35:08:26, 160.75s/it][AINFO 12-02 10:31:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:31:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:31:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:31:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 238/1024 [10:20:39<34:25:43, 157.69s/it][A
+                                                         [A{'loss': 0.0353, 'grad_norm': 0.0021239183843135834, 'learning_rate': 1e-05, 'num_tokens': 188706605.0, 'completions/mean_length': 6241.78125, 'completions/min_length': 548.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6161.92138671875, 'completions/min_terminated_length': 548.0, 'completions/max_terminated_length': 15893.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.3135277330875397, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02103862166404724, 'sampling/sampling_logp_difference/max': 5.328148365020752, 'sampling/importance_sampling_ratio/min': 0.004853047896176577, 'sampling/importance_sampling_ratio/mean': 0.9999796748161316, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0217387825250626, 'clip_ratio/low_mean': 5.0333514764133724e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0905440553397057e-06, 'clip_ratio/high_max': 1.2362176221358823e-05, 'clip_ratio/region_mean': 5.342405825103924e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 238/1024 [10:20:39<34:25:43, 157.69s/it][AINFO 12-02 10:34:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:34:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:34:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:34:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 239/1024 [10:23:06<33:42:26, 154.58s/it][A
+                                                         [A{'loss': 0.0458, 'grad_norm': 0.0023868419229984283, 'learning_rate': 1e-05, 'num_tokens': 189446294.0, 'completions/mean_length': 5638.1328125, 'completions/min_length': 66.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5553.51953125, 'completions/min_terminated_length': 66.0, 'completions/max_terminated_length': 15332.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017401430755853653, 'sampling/sampling_logp_difference/max': 7.124982833862305, 'sampling/importance_sampling_ratio/min': 0.0008047468145377934, 'sampling/importance_sampling_ratio/mean': 1.0000369548797607, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7844365313649178, 'clip_ratio/low_mean': 3.437372129155847e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.681282583376742e-06, 'clip_ratio/high_max': 6.725130333506968e-06, 'clip_ratio/region_mean': 3.605500387493521e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 239/1024 [10:23:06<33:42:26, 154.58s/it][AINFO 12-02 10:36:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:36:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:36:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:36:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 240/1024 [10:26:08<35:24:14, 162.57s/it][A
+                                                         [A{'loss': 0.0627, 'grad_norm': 0.0027549315709620714, 'learning_rate': 1e-05, 'num_tokens': 190281461.0, 'completions/mean_length': 6374.8046875, 'completions/min_length': 722.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6215.9287109375, 'completions/min_terminated_length': 722.0, 'completions/max_terminated_length': 16196.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3167053163051605, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020243138074874878, 'sampling/sampling_logp_difference/max': 9.552709579467773, 'sampling/importance_sampling_ratio/min': 7.100860239006579e-05, 'sampling/importance_sampling_ratio/mean': 0.9998682737350464, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9472770467400551, 'clip_ratio/low_mean': 2.826443028425274e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.219325539153942e-06, 'clip_ratio/high_max': 2.887730215661577e-05, 'clip_ratio/region_mean': 3.548375502759882e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 240/1024 [10:26:08<35:24:14, 162.57s/it][AINFO 12-02 10:39:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:39:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:39:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:39:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▎       | 241/1024 [10:28:52<35:27:27, 163.02s/it][A
+                                                         [A{'loss': 0.0184, 'grad_norm': 0.0013344973558560014, 'learning_rate': 1e-05, 'num_tokens': 191156249.0, 'completions/mean_length': 6654.21875, 'completions/min_length': 246.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6499.88134765625, 'completions/min_terminated_length': 246.0, 'completions/max_terminated_length': 15097.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.22832971811294556, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020470600575208664, 'sampling/sampling_logp_difference/max': 6.124969959259033, 'sampling/importance_sampling_ratio/min': 0.0021875568199902773, 'sampling/importance_sampling_ratio/mean': 0.9999619722366333, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0028243213891983, 'clip_ratio/low_mean': 2.978218674343225e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.9669694160693325e-06, 'clip_ratio/high_max': 1.586787766427733e-05, 'clip_ratio/region_mean': 3.374915604581474e-05, 'epoch': 0.22}
+
+ 24%|██▎       | 241/1024 [10:28:52<35:27:27, 163.02s/it][AINFO 12-02 10:42:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:42:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:42:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:42:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▎       | 242/1024 [10:31:38<35:38:27, 164.08s/it][A
+                                                         [A{'loss': 0.0816, 'grad_norm': 0.0016136945923790336, 'learning_rate': 1e-05, 'num_tokens': 192040526.0, 'completions/mean_length': 6767.7890625, 'completions/min_length': 132.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6537.00048828125, 'completions/min_terminated_length': 132.0, 'completions/max_terminated_length': 16172.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2909066081047058, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020058143883943558, 'sampling/sampling_logp_difference/max': 11.29355525970459, 'sampling/importance_sampling_ratio/min': 1.2452921509975567e-05, 'sampling/importance_sampling_ratio/mean': 0.9999668598175049, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9059296399354935, 'clip_ratio/low_mean': 2.040554932136729e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.9954849146160996e-06, 'clip_ratio/high_max': 1.681529829511419e-05, 'clip_ratio/region_mean': 2.5401033553862362e-05, 'epoch': 0.22}
+
+ 24%|██▎       | 242/1024 [10:31:38<35:38:27, 164.08s/it][AINFO 12-02 10:45:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:45:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:45:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:45:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▎       | 243/1024 [10:34:21<35:31:01, 163.71s/it][A
+                                                         [A{'loss': 0.0212, 'grad_norm': 0.002869367366656661, 'learning_rate': 1e-05, 'num_tokens': 192926469.0, 'completions/mean_length': 6767.4921875, 'completions/min_length': 998.0, 'completions/max_length': 16275.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6767.4921875, 'completions/min_terminated_length': 998.0, 'completions/max_terminated_length': 16275.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2517249882221222, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021512050181627274, 'sampling/sampling_logp_difference/max': 10.853924751281738, 'sampling/importance_sampling_ratio/min': 1.9328599591972306e-05, 'sampling/importance_sampling_ratio/mean': 0.9999586343765259, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0446822568774223, 'clip_ratio/low_mean': 2.9821966563758906e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.9821966563758906e-05, 'epoch': 0.22}
+
+ 24%|██▎       | 243/1024 [10:34:21<35:31:01, 163.71s/it][AINFO 12-02 10:47:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:47:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:47:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:47:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 244/1024 [10:36:51<34:36:11, 159.71s/it][A
+                                                         [A{'loss': -0.0189, 'grad_norm': 0.002177527640014887, 'learning_rate': 1e-05, 'num_tokens': 193678859.0, 'completions/mean_length': 5738.484375, 'completions/min_length': 184.0, 'completions/max_length': 16261.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5738.484375, 'completions/min_terminated_length': 184.0, 'completions/max_terminated_length': 16261.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.33220988512039185, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018141131848096848, 'sampling/sampling_logp_difference/max': 7.06630802154541, 'sampling/importance_sampling_ratio/min': 0.0008533780346624553, 'sampling/importance_sampling_ratio/mean': 0.9999570846557617, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8617956340312958, 'clip_ratio/low_mean': 3.6407937841431703e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3001711295146379e-05, 'clip_ratio/high_max': 3.44581130775623e-05, 'clip_ratio/region_mean': 4.940964981869911e-05, 'epoch': 0.22}
+
+ 24%|██▍       | 244/1024 [10:36:51<34:36:11, 159.71s/it][AINFO 12-02 10:50:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:50:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:50:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:50:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 245/1024 [10:39:32<34:35:54, 159.89s/it][A
+                                                         [A{'loss': 0.0572, 'grad_norm': 0.004127771593630314, 'learning_rate': 1e-05, 'num_tokens': 194511847.0, 'completions/mean_length': 6382.90625, 'completions/min_length': 464.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5976.357421875, 'completions/min_terminated_length': 464.0, 'completions/max_terminated_length': 15595.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2767002582550049, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018376430496573448, 'sampling/sampling_logp_difference/max': 12.124691009521484, 'sampling/importance_sampling_ratio/min': 5.4239239943854045e-06, 'sampling/importance_sampling_ratio/mean': 0.9998810291290283, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8692388981580734, 'clip_ratio/low_mean': 2.7767115511778684e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.652509334046044e-07, 'clip_ratio/high_max': 3.861003733618418e-06, 'clip_ratio/region_mean': 2.8732366558870126e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 245/1024 [10:39:32<34:35:54, 159.89s/it][AINFO 12-02 10:53:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:53:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:53:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:53:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 246/1024 [10:41:49<33:04:52, 153.07s/it][A
+                                                         [A{'loss': 0.0687, 'grad_norm': 0.00263008801266551, 'learning_rate': 1e-05, 'num_tokens': 195270051.0, 'completions/mean_length': 5776.15625, 'completions/min_length': 1018.0, 'completions/max_length': 14504.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5776.15625, 'completions/min_terminated_length': 1018.0, 'completions/max_terminated_length': 14504.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3618982434272766, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019923292100429535, 'sampling/sampling_logp_difference/max': 5.257286548614502, 'sampling/importance_sampling_ratio/min': 0.005209421273320913, 'sampling/importance_sampling_ratio/mean': 0.999971866607666, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1195004731416702, 'clip_ratio/low_mean': 5.3631663831765763e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4320988813997246e-06, 'clip_ratio/high_max': 9.728395525598899e-06, 'clip_ratio/region_mean': 5.606376271316549e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 246/1024 [10:41:49<33:04:52, 153.07s/it][AINFO 12-02 10:55:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 247/1024 [10:44:36<33:57:50, 157.36s/it][A
+                                                         [A{'loss': 0.07, 'grad_norm': 0.002642859937623143, 'learning_rate': 1e-05, 'num_tokens': 196240913.0, 'completions/mean_length': 7411.421875, 'completions/min_length': 455.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7196.08056640625, 'completions/min_terminated_length': 455.0, 'completions/max_terminated_length': 15203.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.27328529953956604, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021511007100343704, 'sampling/sampling_logp_difference/max': 8.650712966918945, 'sampling/importance_sampling_ratio/min': 0.00017500204558018595, 'sampling/importance_sampling_ratio/mean': 0.9999198913574219, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9801053553819656, 'clip_ratio/low_mean': 4.2162768181697174e-05, 'clip_ratio/low_min': 3.873926743835909e-06, 'clip_ratio/high_mean': 3.1752751965541393e-06, 'clip_ratio/high_max': 1.2701100786216557e-05, 'clip_ratio/region_mean': 4.5338043378251314e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 247/1024 [10:44:36<33:57:50, 157.36s/it][AINFO 12-02 10:58:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:58:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:58:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:58:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 248/1024 [10:47:48<36:10:22, 167.81s/it][A
+                                                         [A{'loss': 0.0564, 'grad_norm': 0.002439325675368309, 'learning_rate': 1e-05, 'num_tokens': 197278517.0, 'completions/mean_length': 7944.65625, 'completions/min_length': 144.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7742.1123046875, 'completions/min_terminated_length': 144.0, 'completions/max_terminated_length': 16127.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.3161812424659729, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02124868705868721, 'sampling/sampling_logp_difference/max': 11.49896240234375, 'sampling/importance_sampling_ratio/min': 1.0140610356756952e-05, 'sampling/importance_sampling_ratio/mean': 0.9999248385429382, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0132562816143036, 'clip_ratio/low_mean': 6.367217611114029e-05, 'clip_ratio/low_min': 4.8010447244450916e-06, 'clip_ratio/high_mean': 3.780734232350369e-06, 'clip_ratio/high_max': 1.5122936929401476e-05, 'clip_ratio/region_mean': 6.745291057086433e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 248/1024 [10:47:48<36:10:22, 167.81s/it][AINFO 12-02 11:01:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:01:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:01:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:01:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 249/1024 [10:50:49<36:58:15, 171.74s/it][A
+                                                         [A{'loss': 0.0338, 'grad_norm': 0.0014879995724186301, 'learning_rate': 1e-05, 'num_tokens': 198265589.0, 'completions/mean_length': 7550.0, 'completions/min_length': 1469.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7409.7783203125, 'completions/min_terminated_length': 1469.0, 'completions/max_terminated_length': 16317.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.24040167033672333, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021503347903490067, 'sampling/sampling_logp_difference/max': 9.382474899291992, 'sampling/importance_sampling_ratio/min': 8.418659126618877e-05, 'sampling/importance_sampling_ratio/mean': 0.9999468922615051, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0384011715650558, 'clip_ratio/low_mean': 3.7791321346958284e-05, 'clip_ratio/low_min': 3.2110563097376144e-06, 'clip_ratio/high_mean': 6.504384089112136e-06, 'clip_ratio/high_max': 2.6017536356448545e-05, 'clip_ratio/region_mean': 4.429570503816649e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 249/1024 [10:50:49<36:58:15, 171.74s/it][AINFO 12-02 11:04:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:04:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:04:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:04:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 250/1024 [10:54:10<38:48:03, 180.47s/it][A
+                                                         [A{'loss': 0.0585, 'grad_norm': 0.002340668346732855, 'learning_rate': 1e-05, 'num_tokens': 199324938.0, 'completions/mean_length': 8140.9140625, 'completions/min_length': 837.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7517.48779296875, 'completions/min_terminated_length': 837.0, 'completions/max_terminated_length': 16221.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.35824596881866455, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019466478377580643, 'sampling/sampling_logp_difference/max': 6.064027786254883, 'sampling/importance_sampling_ratio/min': 0.002325017238035798, 'sampling/importance_sampling_ratio/mean': 0.9999454021453857, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8718572407960892, 'clip_ratio/low_mean': 3.916533574965797e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.491880531531933e-06, 'clip_ratio/high_max': 1.3615457191917812e-05, 'clip_ratio/region_mean': 4.365721684962409e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 250/1024 [10:54:10<38:48:03, 180.47s/it][AINFO 12-02 11:07:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:07:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:07:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:07:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 251/1024 [10:56:52<37:31:51, 174.79s/it][A
+                                                         [A{'loss': 0.0218, 'grad_norm': 0.001989356242120266, 'learning_rate': 1e-05, 'num_tokens': 200189902.0, 'completions/mean_length': 6630.96875, 'completions/min_length': 116.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6396.896484375, 'completions/min_terminated_length': 116.0, 'completions/max_terminated_length': 16263.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.2987973093986511, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01849902793765068, 'sampling/sampling_logp_difference/max': 8.011649131774902, 'sampling/importance_sampling_ratio/min': 0.0003315774374641478, 'sampling/importance_sampling_ratio/mean': 0.9999474883079529, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7798146530985832, 'clip_ratio/low_mean': 4.1318608055007644e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.543924260109634e-06, 'clip_ratio/high_max': 2.2175697040438536e-05, 'clip_ratio/region_mean': 4.686253225827386e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 251/1024 [10:56:52<37:31:51, 174.79s/it][AINFO 12-02 11:10:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:10:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:10:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:10:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 252/1024 [10:59:17<35:34:58, 165.93s/it][A
+                                                         [A{'loss': 0.0232, 'grad_norm': 0.002594445599243045, 'learning_rate': 1e-05, 'num_tokens': 201052832.0, 'completions/mean_length': 6582.203125, 'completions/min_length': 593.0, 'completions/max_length': 15357.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6582.203125, 'completions/min_terminated_length': 593.0, 'completions/max_terminated_length': 15357.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021598614752292633, 'sampling/sampling_logp_difference/max': 7.8613433837890625, 'sampling/importance_sampling_ratio/min': 0.0003853558446280658, 'sampling/importance_sampling_ratio/mean': 0.9999495148658752, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0181676000356674, 'clip_ratio/low_mean': 2.0285911205064622e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.31426575587102e-07, 'clip_ratio/high_max': 3.325706302348408e-06, 'clip_ratio/region_mean': 2.111733795118198e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 252/1024 [10:59:17<35:34:58, 165.93s/it][AINFO 12-02 11:12:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:12:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:12:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:12:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 253/1024 [11:01:45<34:23:58, 160.62s/it][A
+                                                         [A{'loss': 0.0811, 'grad_norm': 0.003104996867477894, 'learning_rate': 1e-05, 'num_tokens': 201858047.0, 'completions/mean_length': 6145.1796875, 'completions/min_length': 1098.0, 'completions/max_length': 14540.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6145.1796875, 'completions/min_terminated_length': 1098.0, 'completions/max_terminated_length': 14540.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.33220985531806946, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018979094922542572, 'sampling/sampling_logp_difference/max': 4.87296724319458, 'sampling/importance_sampling_ratio/min': 0.007650630082935095, 'sampling/importance_sampling_ratio/mean': 1.000011682510376, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9084350541234016, 'clip_ratio/low_mean': 3.4155824209847196e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.511107588063169e-06, 'clip_ratio/high_max': 2.2044430352252675e-05, 'clip_ratio/region_mean': 3.96669319115972e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 253/1024 [11:01:45<34:23:58, 160.62s/it][AINFO 12-02 11:15:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:15:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:15:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:15:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 254/1024 [11:04:30<34:36:47, 161.83s/it][A
+                                                         [A{'loss': 0.0133, 'grad_norm': 0.003212577663362026, 'learning_rate': 1e-05, 'num_tokens': 202807673.0, 'completions/mean_length': 7259.953125, 'completions/min_length': 960.0, 'completions/max_length': 15745.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7259.953125, 'completions/min_terminated_length': 960.0, 'completions/max_terminated_length': 15745.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021432969719171524, 'sampling/sampling_logp_difference/max': 7.530435085296631, 'sampling/importance_sampling_ratio/min': 0.000536504783667624, 'sampling/importance_sampling_ratio/mean': 0.999860405921936, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9823614731431007, 'clip_ratio/low_mean': 3.5255963325653283e-05, 'clip_ratio/low_min': 2.973075879708631e-06, 'clip_ratio/high_mean': 7.489994629850116e-06, 'clip_ratio/high_max': 2.9959978519400465e-05, 'clip_ratio/region_mean': 4.274595892184152e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 254/1024 [11:04:30<34:36:47, 161.83s/it][AINFO 12-02 11:18:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:18:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:18:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:18:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 255/1024 [11:07:32<35:51:49, 167.89s/it][A
+                                                         [A{'loss': 0.0411, 'grad_norm': 0.001418307889252901, 'learning_rate': 1e-05, 'num_tokens': 203757333.0, 'completions/mean_length': 7258.71875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7113.87353515625, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16369.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3048579692840576, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019296500831842422, 'sampling/sampling_logp_difference/max': 7.3526411056518555, 'sampling/importance_sampling_ratio/min': 0.0006408974295482039, 'sampling/importance_sampling_ratio/mean': 0.9999884963035583, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8823810070753098, 'clip_ratio/low_mean': 2.9539680099333054e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.095591565255745e-06, 'clip_ratio/high_max': 3.273996276220714e-05, 'clip_ratio/region_mean': 3.8635271948805894e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 255/1024 [11:07:32<35:51:49, 167.89s/it][AINFO 12-02 11:21:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:21:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:21:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:21:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 256/1024 [11:10:27<36:18:15, 170.18s/it][A
+                                                         [A{'loss': 0.0459, 'grad_norm': 0.001652427832596004, 'learning_rate': 1e-05, 'num_tokens': 204675065.0, 'completions/mean_length': 7011.40625, 'completions/min_length': 685.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6386.56689453125, 'completions/min_terminated_length': 685.0, 'completions/max_terminated_length': 15605.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.24146251380443573, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018787402659654617, 'sampling/sampling_logp_difference/max': 4.178651332855225, 'sampling/importance_sampling_ratio/min': 0.015319154597818851, 'sampling/importance_sampling_ratio/mean': 0.9999918937683105, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8057166337966919, 'clip_ratio/low_mean': 3.755458698151415e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.860298420477193e-06, 'clip_ratio/high_max': 1.544119368190877e-05, 'clip_ratio/region_mean': 4.141488631148604e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 256/1024 [11:10:27<36:18:15, 170.18s/it][AINFO 12-02 11:24:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:24:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:24:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:24:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▌       | 257/1024 [11:13:12<35:52:05, 168.35s/it][A
+                                                         [A{'loss': 0.0677, 'grad_norm': 0.001482579973526299, 'learning_rate': 1e-05, 'num_tokens': 205494344.0, 'completions/mean_length': 6243.4296875, 'completions/min_length': 1023.0, 'completions/max_length': 14796.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6243.4296875, 'completions/min_terminated_length': 1023.0, 'completions/max_terminated_length': 14796.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.28930407762527466, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019727632403373718, 'sampling/sampling_logp_difference/max': 7.762371063232422, 'sampling/importance_sampling_ratio/min': 0.0004254466330166906, 'sampling/importance_sampling_ratio/mean': 0.9998942613601685, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9856048971414566, 'clip_ratio/low_mean': 4.0701652551433654e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.209917965956265e-06, 'clip_ratio/high_max': 5.222041181696113e-06, 'clip_ratio/region_mean': 4.291157006264257e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 257/1024 [11:13:12<35:52:05, 168.35s/it][AINFO 12-02 11:26:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:26:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:26:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:26:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 258/1024 [11:16:14<36:44:16, 172.66s/it][A
+                                                         [A{'loss': 0.0312, 'grad_norm': 0.002006452763453126, 'learning_rate': 1e-05, 'num_tokens': 206428775.0, 'completions/mean_length': 7122.2421875, 'completions/min_length': 445.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6586.4375, 'completions/min_terminated_length': 445.0, 'completions/max_terminated_length': 15485.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2987973093986511, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01927522011101246, 'sampling/sampling_logp_difference/max': 9.123116493225098, 'sampling/importance_sampling_ratio/min': 0.00010911409481195733, 'sampling/importance_sampling_ratio/mean': 0.9999338388442993, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8625433370471001, 'clip_ratio/low_mean': 6.842733455414418e-05, 'clip_ratio/low_min': 9.297655878981459e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 6.842733455414418e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 258/1024 [11:16:14<36:44:16, 172.66s/it][AINFO 12-02 11:29:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:29:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:29:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:29:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 259/1024 [11:19:11<36:56:13, 173.82s/it][A
+                                                         [A{'loss': 0.0179, 'grad_norm': 0.0029176415409892797, 'learning_rate': 1e-05, 'num_tokens': 207469586.0, 'completions/mean_length': 7965.2734375, 'completions/min_length': 399.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7623.6826171875, 'completions/min_terminated_length': 399.0, 'completions/max_terminated_length': 15254.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2212003916501999, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02198987640440464, 'sampling/sampling_logp_difference/max': 12.414315223693848, 'sampling/importance_sampling_ratio/min': 4.06005028708023e-06, 'sampling/importance_sampling_ratio/mean': 0.9998945593833923, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0068430602550507, 'clip_ratio/low_mean': 2.7790995090981596e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.219018016257905e-06, 'clip_ratio/high_max': 2.887607206503162e-05, 'clip_ratio/region_mean': 3.501001378936053e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 259/1024 [11:19:11<36:56:13, 173.82s/it][AINFO 12-02 11:32:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:32:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:32:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:32:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 260/1024 [11:22:11<37:16:52, 175.67s/it][A
+                                                         [A{'loss': 0.0399, 'grad_norm': 0.0019115234026685357, 'learning_rate': 1e-05, 'num_tokens': 208300217.0, 'completions/mean_length': 6329.4296875, 'completions/min_length': 160.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6169.83349609375, 'completions/min_terminated_length': 160.0, 'completions/max_terminated_length': 15331.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020002204924821854, 'sampling/sampling_logp_difference/max': 10.749804496765137, 'sampling/importance_sampling_ratio/min': 2.1449603082146496e-05, 'sampling/importance_sampling_ratio/mean': 1.0000567436218262, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9399363100528717, 'clip_ratio/low_mean': 4.1899779091636447e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1777496499453264e-06, 'clip_ratio/high_max': 8.710998599781306e-06, 'clip_ratio/region_mean': 4.407752874158177e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 260/1024 [11:22:11<37:16:52, 175.67s/it][AINFO 12-02 11:35:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:35:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:35:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:35:44 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 11:37:23,936 - math_verify.grader - WARNING - Timeout during comparison
+
+ 25%|██▌       | 261/1024 [11:24:50<36:11:50, 170.79s/it][A
+                                                         [A{'loss': 0.0667, 'grad_norm': 0.003766207257285714, 'learning_rate': 1e-05, 'num_tokens': 209181077.0, 'completions/mean_length': 6747.90625, 'completions/min_length': 350.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6594.95263671875, 'completions/min_terminated_length': 350.0, 'completions/max_terminated_length': 15635.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.3164137303829193, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020067427307367325, 'sampling/sampling_logp_difference/max': 11.28918743133545, 'sampling/importance_sampling_ratio/min': 1.250743298442103e-05, 'sampling/importance_sampling_ratio/mean': 0.9999313354492188, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9575144425034523, 'clip_ratio/low_mean': 5.959111433639919e-05, 'clip_ratio/low_min': 1.1521060741870315e-05, 'clip_ratio/high_mean': 6.341961125144735e-06, 'clip_ratio/high_max': 2.536784450057894e-05, 'clip_ratio/region_mean': 6.593307591629127e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 261/1024 [11:24:50<36:11:50, 170.79s/it][AINFO 12-02 11:38:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:38:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:38:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:38:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 262/1024 [11:27:31<35:29:33, 167.68s/it][A
+                                                         [A{'loss': 0.0752, 'grad_norm': 0.002375675830990076, 'learning_rate': 1e-05, 'num_tokens': 210023702.0, 'completions/mean_length': 6426.6953125, 'completions/min_length': 767.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6348.29150390625, 'completions/min_terminated_length': 767.0, 'completions/max_terminated_length': 16218.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.38900789618492126, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018864646553993225, 'sampling/sampling_logp_difference/max': 8.324122428894043, 'sampling/importance_sampling_ratio/min': 0.00024259372730739415, 'sampling/importance_sampling_ratio/mean': 0.9999383687973022, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.87480478733778, 'clip_ratio/low_mean': 5.808068385704246e-05, 'clip_ratio/low_min': 1.0360539818066172e-05, 'clip_ratio/high_mean': 5.156518568583124e-06, 'clip_ratio/high_max': 2.0626074274332495e-05, 'clip_ratio/region_mean': 6.32372018571914e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 262/1024 [11:27:31<35:29:33, 167.68s/it][AINFO 12-02 11:41:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:41:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:41:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:41:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 263/1024 [11:30:16<35:18:10, 167.01s/it][A
+                                                         [A{'loss': 0.0791, 'grad_norm': 0.0016257674433290958, 'learning_rate': 1e-05, 'num_tokens': 210969921.0, 'completions/mean_length': 7223.1484375, 'completions/min_length': 1015.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6927.63671875, 'completions/min_terminated_length': 1015.0, 'completions/max_terminated_length': 16022.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2896084189414978, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02119653858244419, 'sampling/sampling_logp_difference/max': 9.294405937194824, 'sampling/importance_sampling_ratio/min': 9.193710138788447e-05, 'sampling/importance_sampling_ratio/mean': 0.9999514818191528, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0218688547611237, 'clip_ratio/low_mean': 3.8966268334661436e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1157128483318957e-06, 'clip_ratio/high_max': 4.462851393327583e-06, 'clip_ratio/region_mean': 4.008198141036701e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 263/1024 [11:30:16<35:18:10, 167.01s/it][AINFO 12-02 11:43:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:43:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:43:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:43:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 264/1024 [11:32:48<34:16:22, 162.35s/it][A
+                                                         [A{'loss': 0.0567, 'grad_norm': 0.0023448490537703037, 'learning_rate': 1e-05, 'num_tokens': 211884866.0, 'completions/mean_length': 6974.0703125, 'completions/min_length': 2.0, 'completions/max_length': 15180.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6974.0703125, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 15180.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2885475754737854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020384611561894417, 'sampling/sampling_logp_difference/max': 7.172055244445801, 'sampling/importance_sampling_ratio/min': 0.0007677432149648666, 'sampling/importance_sampling_ratio/mean': 1.0000474452972412, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9265539348125458, 'clip_ratio/low_mean': 4.864477250521304e-05, 'clip_ratio/low_min': 8.641252861707471e-06, 'clip_ratio/high_mean': 3.163366102398868e-06, 'clip_ratio/high_max': 1.2653464409595472e-05, 'clip_ratio/region_mean': 5.1808138323394815e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 264/1024 [11:32:48<34:16:22, 162.35s/it][AINFO 12-02 11:46:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:46:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:46:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:46:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 265/1024 [11:35:33<34:27:10, 163.41s/it][A
+                                                         [A{'loss': 0.0484, 'grad_norm': 0.003212807234376669, 'learning_rate': 1e-05, 'num_tokens': 212833933.0, 'completions/mean_length': 7247.2734375, 'completions/min_length': 103.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7027.9921875, 'completions/min_terminated_length': 103.0, 'completions/max_terminated_length': 15657.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0199666079133749, 'sampling/sampling_logp_difference/max': 6.437466144561768, 'sampling/importance_sampling_ratio/min': 0.001600456889718771, 'sampling/importance_sampling_ratio/mean': 0.9999449253082275, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9756898358464241, 'clip_ratio/low_mean': 3.179497366545547e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9919793291810493e-06, 'clip_ratio/high_max': 1.1967917316724197e-05, 'clip_ratio/region_mean': 3.478695157355105e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 265/1024 [11:35:33<34:27:10, 163.41s/it][AINFO 12-02 11:49:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:49:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:49:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:49:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 266/1024 [11:38:40<35:54:00, 170.50s/it][A
+                                                         [A{'loss': 0.0357, 'grad_norm': 0.0019600428640842438, 'learning_rate': 1e-05, 'num_tokens': 213848508.0, 'completions/mean_length': 7773.9296875, 'completions/min_length': 568.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7423.9267578125, 'completions/min_terminated_length': 568.0, 'completions/max_terminated_length': 15575.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3129909336566925, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020565161481499672, 'sampling/sampling_logp_difference/max': 10.624964714050293, 'sampling/importance_sampling_ratio/min': 2.430168751743622e-05, 'sampling/importance_sampling_ratio/mean': 1.0000061988830566, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9765531942248344, 'clip_ratio/low_mean': 4.0242122167910566e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.887520392709121e-06, 'clip_ratio/high_max': 1.1404694760130951e-05, 'clip_ratio/region_mean': 4.4129643583801226e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 266/1024 [11:38:40<35:54:00, 170.50s/it][AINFO 12-02 11:52:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:52:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:52:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:52:13 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 11:53:51,936 - math_verify.grader - WARNING - Timeout during comparison
+
+ 26%|██▌       | 267/1024 [11:41:20<35:08:16, 167.10s/it][A
+                                                         [A{'loss': 0.0235, 'grad_norm': 0.0014550165506079793, 'learning_rate': 1e-05, 'num_tokens': 214731180.0, 'completions/mean_length': 6722.5, 'completions/min_length': 1021.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6569.14306640625, 'completions/min_terminated_length': 1021.0, 'completions/max_terminated_length': 15583.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.19332444667816162, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020057080313563347, 'sampling/sampling_logp_difference/max': 4.90623664855957, 'sampling/importance_sampling_ratio/min': 0.007400285452604294, 'sampling/importance_sampling_ratio/mean': 0.9999178647994995, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9291529878973961, 'clip_ratio/low_mean': 2.869901106805628e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6814272498777427e-06, 'clip_ratio/high_max': 6.725708999510971e-06, 'clip_ratio/region_mean': 3.0380438261090603e-05, 'epoch': 0.25}
+
+ 26%|██▌       | 267/1024 [11:41:20<35:08:16, 167.10s/it][AINFO 12-02 11:54:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:54:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:54:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:54:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 268/1024 [11:44:17<35:43:09, 170.09s/it][A
+                                                         [A{'loss': 0.048, 'grad_norm': 0.002483292715623975, 'learning_rate': 1e-05, 'num_tokens': 215645819.0, 'completions/mean_length': 7001.8671875, 'completions/min_length': 930.0, 'completions/max_length': 15797.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7001.8671875, 'completions/min_terminated_length': 930.0, 'completions/max_terminated_length': 15797.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.32955142855644226, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020808640867471695, 'sampling/sampling_logp_difference/max': 11.493552207946777, 'sampling/importance_sampling_ratio/min': 1.0195622053288389e-05, 'sampling/importance_sampling_ratio/mean': 0.9999587535858154, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0746883526444435, 'clip_ratio/low_mean': 3.448591337473772e-05, 'clip_ratio/low_min': 4.687090040533803e-06, 'clip_ratio/high_mean': 6.827749643889547e-06, 'clip_ratio/high_max': 1.8797170469042612e-05, 'clip_ratio/region_mean': 4.1313662677566754e-05, 'epoch': 0.25}
+
+ 26%|██▌       | 268/1024 [11:44:17<35:43:09, 170.09s/it][AINFO 12-02 11:57:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:57:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:57:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:57:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▋       | 269/1024 [11:47:00<35:14:35, 168.05s/it][A
+                                                         [A{'loss': 0.0731, 'grad_norm': 0.0015696679474785924, 'learning_rate': 1e-05, 'num_tokens': 216519369.0, 'completions/mean_length': 6663.796875, 'completions/min_length': 1148.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6509.50830078125, 'completions/min_terminated_length': 1148.0, 'completions/max_terminated_length': 14189.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3214311897754669, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.022182684391736984, 'sampling/sampling_logp_difference/max': 13.562398910522461, 'sampling/importance_sampling_ratio/min': 1.288027192458685e-06, 'sampling/importance_sampling_ratio/mean': 0.9997707605361938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0000900849699974, 'clip_ratio/low_mean': 8.762007928453386e-05, 'clip_ratio/low_min': 2.3698836685071e-05, 'clip_ratio/high_mean': 2.187017230426136e-06, 'clip_ratio/high_max': 8.748068921704544e-06, 'clip_ratio/region_mean': 8.980709480965743e-05, 'epoch': 0.25}
+
+ 26%|██▋       | 269/1024 [11:47:00<35:14:35, 168.05s/it][AINFO 12-02 12:00:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:00:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:00:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:00:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▋       | 270/1024 [11:49:35<34:23:00, 164.16s/it][A
+                                                         [A{'loss': 0.1076, 'grad_norm': 0.003887100610882044, 'learning_rate': 1e-05, 'num_tokens': 217432432.0, 'completions/mean_length': 6978.7421875, 'completions/min_length': 1661.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6829.45263671875, 'completions/min_terminated_length': 1661.0, 'completions/max_terminated_length': 16381.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3124619722366333, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02127157337963581, 'sampling/sampling_logp_difference/max': 3.8313302993774414, 'sampling/importance_sampling_ratio/min': 0.02168075367808342, 'sampling/importance_sampling_ratio/mean': 0.9999902248382568, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0845019966363907, 'clip_ratio/low_mean': 4.567897690321843e-05, 'clip_ratio/low_min': 3.287224444648018e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.567897690321843e-05, 'epoch': 0.25}
+
+ 26%|██▋       | 270/1024 [11:49:35<34:23:00, 164.16s/it][AINFO 12-02 12:03:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:03:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:03:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:03:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▋       | 271/1024 [11:52:13<33:55:50, 162.22s/it][A
+                                                         [A{'loss': 0.0771, 'grad_norm': 0.002851828932762146, 'learning_rate': 1e-05, 'num_tokens': 218208399.0, 'completions/mean_length': 5903.5546875, 'completions/min_length': 651.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5652.0244140625, 'completions/min_terminated_length': 651.0, 'completions/max_terminated_length': 15638.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.3713914752006531, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01880766451358795, 'sampling/sampling_logp_difference/max': 7.374999523162842, 'sampling/importance_sampling_ratio/min': 0.000626727007329464, 'sampling/importance_sampling_ratio/mean': 1.0000553131103516, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8638224303722382, 'clip_ratio/low_mean': 5.1527222922231886e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.11082077739411e-06, 'clip_ratio/high_max': 2.444328310957644e-05, 'clip_ratio/region_mean': 5.7638043699625996e-05, 'epoch': 0.25}
+
+ 26%|██▋       | 271/1024 [11:52:13<33:55:50, 162.22s/it][AINFO 12-02 12:05:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:05:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:05:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:05:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 272/1024 [11:54:42<33:03:01, 158.22s/it][A
+                                                         [A{'loss': 0.0075, 'grad_norm': 0.0018005800666287541, 'learning_rate': 1e-05, 'num_tokens': 218944418.0, 'completions/mean_length': 5596.7109375, 'completions/min_length': 87.0, 'completions/max_length': 16060.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5596.7109375, 'completions/min_terminated_length': 87.0, 'completions/max_terminated_length': 16060.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.29485049843788147, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02004322223365307, 'sampling/sampling_logp_difference/max': 4.167722702026367, 'sampling/importance_sampling_ratio/min': 0.01548748929053545, 'sampling/importance_sampling_ratio/mean': 1.0000048875808716, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1127397641539574, 'clip_ratio/low_mean': 2.5821682072546537e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.118718214205728e-06, 'clip_ratio/high_max': 8.474872856822913e-06, 'clip_ratio/region_mean': 2.794040096887329e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 272/1024 [11:54:42<33:03:01, 158.22s/it][AINFO 12-02 12:08:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:08:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:08:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:08:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 273/1024 [11:57:32<33:44:32, 161.75s/it][A
+                                                         [A{'loss': 0.0126, 'grad_norm': 0.0022711476776748896, 'learning_rate': 1e-05, 'num_tokens': 219875952.0, 'completions/mean_length': 7120.109375, 'completions/min_length': 816.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7047.16552734375, 'completions/min_terminated_length': 816.0, 'completions/max_terminated_length': 16362.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.23751862347126007, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02110595628619194, 'sampling/sampling_logp_difference/max': 9.237398147583008, 'sampling/importance_sampling_ratio/min': 9.733050683280453e-05, 'sampling/importance_sampling_ratio/mean': 1.000036358833313, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0697019025683403, 'clip_ratio/low_mean': 3.539464648838475e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.925485768580984e-06, 'clip_ratio/high_max': 1.5034628631838132e-05, 'clip_ratio/region_mean': 4.032013237065257e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 273/1024 [11:57:32<33:44:32, 161.75s/it][AINFO 12-02 12:11:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:11:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:11:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:11:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 274/1024 [12:00:18<33:58:36, 163.09s/it][A
+                                                         [A{'loss': 0.0563, 'grad_norm': 0.002609838731586933, 'learning_rate': 1e-05, 'num_tokens': 220871730.0, 'completions/mean_length': 7623.953125, 'completions/min_length': 372.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7484.9052734375, 'completions/min_terminated_length': 372.0, 'completions/max_terminated_length': 15745.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.30061954259872437, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019322458654642105, 'sampling/sampling_logp_difference/max': 6.472845554351807, 'sampling/importance_sampling_ratio/min': 0.0015448236372321844, 'sampling/importance_sampling_ratio/mean': 0.9999877214431763, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8836525157094002, 'clip_ratio/low_mean': 3.796903268948881e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6396170369480387e-06, 'clip_ratio/high_max': 1.0558468147792155e-05, 'clip_ratio/region_mean': 4.060864915800266e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 274/1024 [12:00:18<33:58:36, 163.09s/it][AINFO 12-02 12:13:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:13:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:13:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:13:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 275/1024 [12:02:46<32:59:00, 158.53s/it][A
+                                                         [A{'loss': 0.0538, 'grad_norm': 0.00395589042454958, 'learning_rate': 1e-05, 'num_tokens': 221761214.0, 'completions/mean_length': 6778.71875, 'completions/min_length': 1187.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6703.08642578125, 'completions/min_terminated_length': 1187.0, 'completions/max_terminated_length': 15557.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.4032142758369446, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.018937086686491966, 'sampling/sampling_logp_difference/max': 6.7486371994018555, 'sampling/importance_sampling_ratio/min': 0.0011724763317033648, 'sampling/importance_sampling_ratio/mean': 1.0000214576721191, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8968989998102188, 'clip_ratio/low_mean': 5.795533934360719e-05, 'clip_ratio/low_min': 4.49300887339632e-06, 'clip_ratio/high_mean': 2.86196302567987e-06, 'clip_ratio/high_max': 1.144785210271948e-05, 'clip_ratio/region_mean': 6.081730361984228e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 275/1024 [12:02:46<32:59:00, 158.53s/it][AINFO 12-02 12:16:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:16:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:16:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:16:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 276/1024 [12:06:03<35:22:34, 170.26s/it][A
+                                                         [A{'loss': 0.0271, 'grad_norm': 0.002063714899122715, 'learning_rate': 1e-05, 'num_tokens': 222719287.0, 'completions/mean_length': 7319.2578125, 'completions/min_length': 1034.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6794.85107421875, 'completions/min_terminated_length': 1034.0, 'completions/max_terminated_length': 16100.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.2835301160812378, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019336167722940445, 'sampling/sampling_logp_difference/max': 10.7538423538208, 'sampling/importance_sampling_ratio/min': 2.13631665246794e-05, 'sampling/importance_sampling_ratio/mean': 0.9999525547027588, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.870811752974987, 'clip_ratio/low_mean': 3.0297362627607072e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.628764933542698e-06, 'clip_ratio/high_max': 2.708495139813749e-05, 'clip_ratio/region_mean': 3.792612744746293e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 276/1024 [12:06:03<35:22:34, 170.26s/it][AINFO 12-02 12:19:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:19:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:19:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:19:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 277/1024 [12:08:46<34:50:43, 167.93s/it][A
+                                                         [A{'loss': 0.0696, 'grad_norm': 0.0023463829420506954, 'learning_rate': 1e-05, 'num_tokens': 223533372.0, 'completions/mean_length': 6207.4140625, 'completions/min_length': 752.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5879.13671875, 'completions/min_terminated_length': 752.0, 'completions/max_terminated_length': 16326.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2359210103750229, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018392907455563545, 'sampling/sampling_logp_difference/max': 10.749905586242676, 'sampling/importance_sampling_ratio/min': 2.1447433027788065e-05, 'sampling/importance_sampling_ratio/mean': 1.0000433921813965, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8348869979381561, 'clip_ratio/low_mean': 2.303871349340625e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.650721040088683e-07, 'clip_ratio/high_max': 3.860288416035473e-06, 'clip_ratio/region_mean': 2.4003785597415117e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 277/1024 [12:08:46<34:50:43, 167.93s/it][AINFO 12-02 12:22:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:22:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:22:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:22:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 278/1024 [12:11:10<33:20:53, 160.93s/it][A
+                                                         [A{'loss': 0.034, 'grad_norm': 0.0033652919810265303, 'learning_rate': 1e-05, 'num_tokens': 224375711.0, 'completions/mean_length': 6417.2109375, 'completions/min_length': 981.0, 'completions/max_length': 14713.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6417.2109375, 'completions/min_terminated_length': 981.0, 'completions/max_terminated_length': 14713.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3169426918029785, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020980924367904663, 'sampling/sampling_logp_difference/max': 6.18701171875, 'sampling/importance_sampling_ratio/min': 0.0020559614058583975, 'sampling/importance_sampling_ratio/mean': 0.9999820590019226, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0232173576951027, 'clip_ratio/low_mean': 5.504566888703266e-05, 'clip_ratio/low_min': 1.2581466762640048e-05, 'clip_ratio/high_mean': 5.360416366784193e-06, 'clip_ratio/high_max': 2.1441665467136772e-05, 'clip_ratio/region_mean': 6.040608514013002e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 278/1024 [12:11:10<33:20:53, 160.93s/it][AINFO 12-02 12:24:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:24:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:24:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:24:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 279/1024 [12:13:30<31:58:52, 154.54s/it][A
+                                                         [A{'loss': 0.0813, 'grad_norm': 0.0025473968125879765, 'learning_rate': 1e-05, 'num_tokens': 225070992.0, 'completions/mean_length': 5275.9453125, 'completions/min_length': 473.0, 'completions/max_length': 15026.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5275.9453125, 'completions/min_terminated_length': 473.0, 'completions/max_terminated_length': 15026.0, 'rewards/accuracy_reward/mean': 0.703125, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.703125, 'reward_std': 0.2790592610836029, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018068701028823853, 'sampling/sampling_logp_difference/max': 6.906133651733398, 'sampling/importance_sampling_ratio/min': 0.0010016229934990406, 'sampling/importance_sampling_ratio/mean': 0.9999873042106628, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8563915193080902, 'clip_ratio/low_mean': 2.818696702888701e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1698862181219738e-06, 'clip_ratio/high_max': 4.679544872487895e-06, 'clip_ratio/region_mean': 2.9356853247008985e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 279/1024 [12:13:30<31:58:52, 154.54s/it][AINFO 12-02 12:27:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:27:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:27:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:27:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 280/1024 [12:16:26<33:17:21, 161.08s/it][A
+                                                         [A{'loss': 0.0836, 'grad_norm': 0.0016663498245179653, 'learning_rate': 1e-05, 'num_tokens': 226073822.0, 'completions/mean_length': 7693.984375, 'completions/min_length': 1349.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7556.0478515625, 'completions/min_terminated_length': 1349.0, 'completions/max_terminated_length': 16350.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3227166533470154, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01851016655564308, 'sampling/sampling_logp_difference/max': 12.04162311553955, 'sampling/importance_sampling_ratio/min': 5.893720299354754e-06, 'sampling/importance_sampling_ratio/mean': 0.9999273419380188, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7832933664321899, 'clip_ratio/low_mean': 4.1973035422415705e-05, 'clip_ratio/low_min': 6.267234766710317e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.1973035422415705e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 280/1024 [12:16:26<33:17:21, 161.08s/it][AINFO 12-02 12:29:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:29:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:29:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:29:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 281/1024 [12:19:03<32:57:37, 159.70s/it][A
+                                                         [A{'loss': 0.055, 'grad_norm': 0.0021045261528342962, 'learning_rate': 1e-05, 'num_tokens': 226888577.0, 'completions/mean_length': 6222.4609375, 'completions/min_length': 967.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6061.1669921875, 'completions/min_terminated_length': 967.0, 'completions/max_terminated_length': 15207.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018851105123758316, 'sampling/sampling_logp_difference/max': 14.379725456237793, 'sampling/importance_sampling_ratio/min': 5.688065698450373e-07, 'sampling/importance_sampling_ratio/mean': 0.9999616742134094, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8835120126605034, 'clip_ratio/low_mean': 3.7096169648975774e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.262004156567855e-06, 'clip_ratio/high_max': 1.304801662627142e-05, 'clip_ratio/region_mean': 4.035817426029098e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 281/1024 [12:19:03<32:57:37, 159.70s/it][AINFO 12-02 12:32:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:32:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:32:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:32:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 282/1024 [12:21:52<33:29:25, 162.49s/it][A
+                                                         [A{'loss': 0.1009, 'grad_norm': 0.0030448357574641705, 'learning_rate': 1e-05, 'num_tokens': 227722025.0, 'completions/mean_length': 6371.625, 'completions/min_length': 1034.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6131.328125, 'completions/min_terminated_length': 1034.0, 'completions/max_terminated_length': 16331.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2722293734550476, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018757576122879982, 'sampling/sampling_logp_difference/max': 8.687409400939941, 'sampling/importance_sampling_ratio/min': 0.00016869648243300617, 'sampling/importance_sampling_ratio/mean': 0.9999712705612183, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9026313945651054, 'clip_ratio/low_mean': 3.1754828114571865e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.1754828114571865e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 282/1024 [12:21:52<33:29:25, 162.49s/it][AINFO 12-02 12:35:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:35:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:35:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:35:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 283/1024 [12:24:39<33:45:20, 163.99s/it][A
+                                                         [A{'loss': 0.0513, 'grad_norm': 0.0019295766251161695, 'learning_rate': 1e-05, 'num_tokens': 228703256.0, 'completions/mean_length': 7510.4921875, 'completions/min_length': 159.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7224.25, 'completions/min_terminated_length': 159.0, 'completions/max_terminated_length': 15891.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.19674429297447205, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02207346074283123, 'sampling/sampling_logp_difference/max': 8.428196907043457, 'sampling/importance_sampling_ratio/min': 0.0002186153142247349, 'sampling/importance_sampling_ratio/mean': 0.9999972581863403, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.044313833117485, 'clip_ratio/low_mean': 3.379111592494155e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7560213905198907e-06, 'clip_ratio/high_max': 7.024085562079563e-06, 'clip_ratio/region_mean': 3.5547137599678535e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 283/1024 [12:24:39<33:45:20, 163.99s/it][AINFO 12-02 12:38:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:38:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:38:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:38:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 284/1024 [12:27:27<33:54:31, 164.96s/it][A
+                                                         [A{'loss': 0.0748, 'grad_norm': 0.0017117204843088984, 'learning_rate': 1e-05, 'num_tokens': 229697002.0, 'completions/mean_length': 7594.140625, 'completions/min_length': 598.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7524.92919921875, 'completions/min_terminated_length': 598.0, 'completions/max_terminated_length': 16007.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.18649455904960632, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021097885444760323, 'sampling/sampling_logp_difference/max': 7.946208953857422, 'sampling/importance_sampling_ratio/min': 0.00035400164779275656, 'sampling/importance_sampling_ratio/mean': 1.000016212463379, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9706612005829811, 'clip_ratio/low_mean': 3.7797102550030104e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2670802789216395e-06, 'clip_ratio/high_max': 5.068321115686558e-06, 'clip_ratio/region_mean': 3.9064182828951743e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 284/1024 [12:27:27<33:54:31, 164.96s/it][AINFO 12-02 12:40:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:40:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:41:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:41:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 285/1024 [12:30:12<33:52:43, 165.04s/it][A
+                                                         [A{'loss': 0.0887, 'grad_norm': 0.0025933689903467894, 'learning_rate': 1e-05, 'num_tokens': 230598679.0, 'completions/mean_length': 6888.9140625, 'completions/min_length': 327.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6738.19873046875, 'completions/min_terminated_length': 327.0, 'completions/max_terminated_length': 15562.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01939917542040348, 'sampling/sampling_logp_difference/max': 7.221237659454346, 'sampling/importance_sampling_ratio/min': 0.0007308972999453545, 'sampling/importance_sampling_ratio/mean': 0.9999586939811707, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9210037142038345, 'clip_ratio/low_mean': 4.570582996166195e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.904650384356501e-06, 'clip_ratio/high_max': 1.5618601537426002e-05, 'clip_ratio/region_mean': 4.961048034601845e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 285/1024 [12:30:12<33:52:43, 165.04s/it][AINFO 12-02 12:43:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:43:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:43:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:43:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 286/1024 [12:32:50<33:26:11, 163.11s/it][A
+                                                         [A{'loss': 0.0841, 'grad_norm': 0.0019397985888645053, 'learning_rate': 1e-05, 'num_tokens': 231440153.0, 'completions/mean_length': 6433.640625, 'completions/min_length': 213.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6355.29150390625, 'completions/min_terminated_length': 213.0, 'completions/max_terminated_length': 15458.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3451131582260132, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021084938198328018, 'sampling/sampling_logp_difference/max': 3.961216926574707, 'sampling/importance_sampling_ratio/min': 0.019039930775761604, 'sampling/importance_sampling_ratio/mean': 0.9999503493309021, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.064419962465763, 'clip_ratio/low_mean': 4.821338916372042e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.9283565835576155e-06, 'clip_ratio/high_max': 2.398964193162101e-05, 'clip_ratio/region_mean': 5.514174608833855e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 286/1024 [12:32:50<33:26:11, 163.11s/it][AINFO 12-02 12:46:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:46:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:46:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:46:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 287/1024 [12:35:24<32:47:27, 160.17s/it][A
+                                                         [A{'loss': 0.0408, 'grad_norm': 0.002288782736286521, 'learning_rate': 1e-05, 'num_tokens': 232209485.0, 'completions/mean_length': 5858.40625, 'completions/min_length': 546.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5691.33349609375, 'completions/min_terminated_length': 546.0, 'completions/max_terminated_length': 14304.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.36637401580810547, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019076552242040634, 'sampling/sampling_logp_difference/max': 8.624787330627441, 'sampling/importance_sampling_ratio/min': 0.00017959839897230268, 'sampling/importance_sampling_ratio/mean': 0.9999239444732666, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8120778575539589, 'clip_ratio/low_mean': 5.4512621773028513e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.997284344834043e-06, 'clip_ratio/high_max': 1.9223051822336856e-05, 'clip_ratio/region_mean': 6.150990611786256e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 287/1024 [12:35:24<32:47:27, 160.17s/it][AINFO 12-02 12:48:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:48:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:48:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:48:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 288/1024 [12:38:35<34:38:59, 169.48s/it][A
+                                                         [A{'loss': 0.0364, 'grad_norm': 0.001075367210432887, 'learning_rate': 1e-05, 'num_tokens': 233133850.0, 'completions/mean_length': 7088.4765625, 'completions/min_length': 688.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6710.609375, 'completions/min_terminated_length': 688.0, 'completions/max_terminated_length': 15825.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.18383610248565674, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.019140273332595825, 'sampling/sampling_logp_difference/max': 5.248020648956299, 'sampling/importance_sampling_ratio/min': 0.005257915705442429, 'sampling/importance_sampling_ratio/mean': 0.9998681545257568, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9231890514492989, 'clip_ratio/low_mean': 1.8137742017643177e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.4680233511608094e-06, 'clip_ratio/high_max': 9.900939403451048e-06, 'clip_ratio/region_mean': 2.1605765368803986e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 288/1024 [12:38:35<34:38:59, 169.48s/it][AINFO 12-02 12:52:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:52:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:52:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:52:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 289/1024 [12:41:22<34:28:57, 168.89s/it][A
+                                                         [A{'loss': 0.0828, 'grad_norm': 0.0015517349820584059, 'learning_rate': 1e-05, 'num_tokens': 233940718.0, 'completions/mean_length': 6151.78125, 'completions/min_length': 772.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5906.20849609375, 'completions/min_terminated_length': 772.0, 'completions/max_terminated_length': 16384.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.21884137392044067, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019276250153779984, 'sampling/sampling_logp_difference/max': 9.482501983642578, 'sampling/importance_sampling_ratio/min': 7.617311348440126e-05, 'sampling/importance_sampling_ratio/mean': 1.0000255107879639, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8585417941212654, 'clip_ratio/low_mean': 1.838804723774956e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1621142423100537e-06, 'clip_ratio/high_max': 8.648456969240215e-06, 'clip_ratio/region_mean': 2.0550161480059614e-05, 'epoch': 0.27}
+
+ 28%|██▊       | 289/1024 [12:41:22<34:28:57, 168.89s/it][AINFO 12-02 12:54:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:54:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:54:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:54:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 290/1024 [12:44:23<35:08:50, 172.38s/it][A
+                                                         [A{'loss': 0.0642, 'grad_norm': 0.0036829947493970394, 'learning_rate': 1e-05, 'num_tokens': 234872111.0, 'completions/mean_length': 7111.2578125, 'completions/min_length': 35.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6812.13671875, 'completions/min_terminated_length': 35.0, 'completions/max_terminated_length': 15133.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.31930169463157654, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021356046199798584, 'sampling/sampling_logp_difference/max': 7.333044528961182, 'sampling/importance_sampling_ratio/min': 0.0006535807042382658, 'sampling/importance_sampling_ratio/mean': 0.9999943971633911, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9735362678766251, 'clip_ratio/low_mean': 2.1342358195397537e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7661499732166703e-06, 'clip_ratio/high_max': 1.1416668485253467e-05, 'clip_ratio/region_mean': 2.5108507770710276e-05, 'epoch': 0.27}
+
+ 28%|██▊       | 290/1024 [12:44:23<35:08:50, 172.38s/it][AINFO 12-02 12:57:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:57:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:57:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:57:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 291/1024 [12:47:17<35:10:49, 172.78s/it][A
+                                                         [A{'loss': 0.0169, 'grad_norm': 0.00234629912301898, 'learning_rate': 1e-05, 'num_tokens': 235759149.0, 'completions/mean_length': 6787.671875, 'completions/min_length': 1404.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6478.11279296875, 'completions/min_terminated_length': 1404.0, 'completions/max_terminated_length': 15995.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.25354230403900146, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01950475014746189, 'sampling/sampling_logp_difference/max': 7.937491416931152, 'sampling/importance_sampling_ratio/min': 0.00035710117663256824, 'sampling/importance_sampling_ratio/mean': 0.9999296069145203, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8856986835598946, 'clip_ratio/low_mean': 3.30086276107977e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.631501267089334e-06, 'clip_ratio/high_max': 2.2526005068357335e-05, 'clip_ratio/region_mean': 3.864012808207917e-05, 'epoch': 0.27}
+
+ 28%|██▊       | 291/1024 [12:47:17<35:10:49, 172.78s/it][AINFO 12-02 13:00:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:00:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:00:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:00:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▊       | 292/1024 [12:49:56<34:18:02, 168.69s/it][A
+                                                         [A{'loss': 0.0527, 'grad_norm': 0.0034125701058655977, 'learning_rate': 1e-05, 'num_tokens': 236643319.0, 'completions/mean_length': 6757.203125, 'completions/min_length': 236.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6604.39697265625, 'completions/min_terminated_length': 236.0, 'completions/max_terminated_length': 15845.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.2896084189414978, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020774487406015396, 'sampling/sampling_logp_difference/max': 11.999980926513672, 'sampling/importance_sampling_ratio/min': 6.144329745438881e-06, 'sampling/importance_sampling_ratio/mean': 0.9999240636825562, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9217840805649757, 'clip_ratio/low_mean': 4.603358706845029e-05, 'clip_ratio/low_min': 4.53654638477019e-06, 'clip_ratio/high_mean': 6.5063205170190486e-06, 'clip_ratio/high_max': 2.6025282068076194e-05, 'clip_ratio/region_mean': 5.253990843812062e-05, 'epoch': 0.27}
+
+ 29%|██▊       | 292/1024 [12:49:56<34:18:02, 168.69s/it][AINFO 12-02 13:03:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:03:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:03:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:03:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▊       | 293/1024 [12:52:49<34:32:53, 170.14s/it][A
+                                                         [A{'loss': 0.0464, 'grad_norm': 0.0020905097480863333, 'learning_rate': 1e-05, 'num_tokens': 237495351.0, 'completions/mean_length': 6504.4375, 'completions/min_length': 516.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6185.74169921875, 'completions/min_terminated_length': 516.0, 'completions/max_terminated_length': 16319.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.30904704332351685, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.023218728601932526, 'sampling/sampling_logp_difference/max': 6.913712024688721, 'sampling/importance_sampling_ratio/min': 0.0009940610034391284, 'sampling/importance_sampling_ratio/mean': 1.0000053644180298, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.126970261335373, 'clip_ratio/low_mean': 3.5210429246035346e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.5210429246035346e-05, 'epoch': 0.27}
+
+ 29%|██▊       | 293/1024 [12:52:49<34:32:53, 170.14s/it][AINFO 12-02 13:06:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:06:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:06:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:06:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▊       | 294/1024 [12:55:36<34:18:39, 169.21s/it][A
+                                                         [A{'loss': 0.0066, 'grad_norm': 0.002073790645226836, 'learning_rate': 1e-05, 'num_tokens': 238251852.0, 'completions/mean_length': 5702.4140625, 'completions/min_length': 867.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5446.05615234375, 'completions/min_terminated_length': 867.0, 'completions/max_terminated_length': 15888.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.2022808939218521, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0185186006128788, 'sampling/sampling_logp_difference/max': 4.117175579071045, 'sampling/importance_sampling_ratio/min': 0.016290459781885147, 'sampling/importance_sampling_ratio/mean': 1.000054955482483, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8803137242794037, 'clip_ratio/low_mean': 1.5693222053414502e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.5693222053414502e-05, 'epoch': 0.27}
+
+ 29%|██▊       | 294/1024 [12:55:36<34:18:39, 169.21s/it][AINFO 12-02 13:09:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:09:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:09:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:09:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 295/1024 [12:58:23<34:04:34, 168.28s/it][A
+                                                         [A{'loss': 0.0908, 'grad_norm': 0.002781527815386653, 'learning_rate': 1e-05, 'num_tokens': 239189385.0, 'completions/mean_length': 7176.2890625, 'completions/min_length': 654.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6801.99169921875, 'completions/min_terminated_length': 654.0, 'completions/max_terminated_length': 15915.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3634958863258362, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020436719059944153, 'sampling/sampling_logp_difference/max': 6.281210422515869, 'sampling/importance_sampling_ratio/min': 0.0018711343873292208, 'sampling/importance_sampling_ratio/mean': 0.9999794960021973, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9554997384548187, 'clip_ratio/low_mean': 2.979715202400257e-05, 'clip_ratio/low_min': 4.1597336348786484e-06, 'clip_ratio/high_mean': 4.4483959982244414e-06, 'clip_ratio/high_max': 1.4213665508577833e-05, 'clip_ratio/region_mean': 3.424554824960069e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 295/1024 [12:58:23<34:04:34, 168.28s/it][AINFO 12-02 13:11:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:11:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:11:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:11:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 296/1024 [13:01:17<34:23:05, 170.04s/it][A
+                                                         [A{'loss': 0.0311, 'grad_norm': 0.003132987068966031, 'learning_rate': 1e-05, 'num_tokens': 240217715.0, 'completions/mean_length': 7855.578125, 'completions/min_length': 688.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7651.2001953125, 'completions/min_terminated_length': 688.0, 'completions/max_terminated_length': 16020.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.28512775897979736, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021461743861436844, 'sampling/sampling_logp_difference/max': 6.773357391357422, 'sampling/importance_sampling_ratio/min': 0.0011438478250056505, 'sampling/importance_sampling_ratio/mean': 0.9999253153800964, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9450526610016823, 'clip_ratio/low_mean': 4.8968343890010146e-05, 'clip_ratio/low_min': 4.0222671486844774e-06, 'clip_ratio/high_mean': 5.171368570699997e-06, 'clip_ratio/high_max': 1.2612186310434481e-05, 'clip_ratio/region_mean': 5.413971166490228e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 296/1024 [13:01:17<34:23:05, 170.04s/it][AINFO 12-02 13:14:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:14:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:14:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:14:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 297/1024 [13:03:59<33:53:58, 167.87s/it][A
+                                                         [A{'loss': 0.015, 'grad_norm': 0.0028523094952106476, 'learning_rate': 1e-05, 'num_tokens': 241035133.0, 'completions/mean_length': 6240.265625, 'completions/min_length': 421.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5913.04833984375, 'completions/min_terminated_length': 421.0, 'completions/max_terminated_length': 14060.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.26143303513526917, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019493088126182556, 'sampling/sampling_logp_difference/max': 7.274198532104492, 'sampling/importance_sampling_ratio/min': 0.0006931954412721097, 'sampling/importance_sampling_ratio/mean': 1.000007152557373, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8811023011803627, 'clip_ratio/low_mean': 3.516899266742257e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.823271291978017e-06, 'clip_ratio/high_max': 2.172341964978841e-05, 'clip_ratio/region_mean': 4.199226441414794e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 297/1024 [13:03:59<33:53:58, 167.87s/it][AINFO 12-02 13:17:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:17:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:17:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:17:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 298/1024 [13:06:43<33:36:12, 166.63s/it][A
+                                                         [A{'loss': 0.0089, 'grad_norm': 0.0026241440791636705, 'learning_rate': 1e-05, 'num_tokens': 241836479.0, 'completions/mean_length': 6103.203125, 'completions/min_length': 355.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6022.251953125, 'completions/min_terminated_length': 355.0, 'completions/max_terminated_length': 15795.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.32589423656463623, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01907728984951973, 'sampling/sampling_logp_difference/max': 8.974145889282227, 'sampling/importance_sampling_ratio/min': 0.00012664205860346556, 'sampling/importance_sampling_ratio/mean': 0.999925434589386, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8766692876815796, 'clip_ratio/low_mean': 3.768150395444536e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.151650389554561e-06, 'clip_ratio/high_max': 1.2606601558218244e-05, 'clip_ratio/region_mean': 4.08331545713736e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 298/1024 [13:06:43<33:36:12, 166.63s/it][AINFO 12-02 13:20:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:20:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:20:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:20:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 299/1024 [13:09:39<34:05:26, 169.28s/it][A
+                                                         [A{'loss': 0.0443, 'grad_norm': 0.003654222236946225, 'learning_rate': 1e-05, 'num_tokens': 242844376.0, 'completions/mean_length': 7711.0703125, 'completions/min_length': 290.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7573.4052734375, 'completions/min_terminated_length': 290.0, 'completions/max_terminated_length': 16169.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2501322627067566, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022012868896126747, 'sampling/sampling_logp_difference/max': 7.374967098236084, 'sampling/importance_sampling_ratio/min': 0.0006267472635954618, 'sampling/importance_sampling_ratio/mean': 0.9999839067459106, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0770929008722305, 'clip_ratio/low_mean': 4.527119426711579e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.350243727913039e-06, 'clip_ratio/high_max': 1.7400974911652156e-05, 'clip_ratio/region_mean': 4.962143839293276e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 299/1024 [13:09:39<34:05:26, 169.28s/it][AINFO 12-02 13:23:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:23:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:23:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:23:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 300/1024 [13:12:16<33:19:05, 165.67s/it][A
+                                                         [A{'loss': 0.1094, 'grad_norm': 0.0029195898678153753, 'learning_rate': 1e-05, 'num_tokens': 243708479.0, 'completions/mean_length': 6616.5546875, 'completions/min_length': 138.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6539.6455078125, 'completions/min_terminated_length': 138.0, 'completions/max_terminated_length': 15905.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3516485095024109, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017992788925766945, 'sampling/sampling_logp_difference/max': 10.729392051696777, 'sampling/importance_sampling_ratio/min': 2.189194128732197e-05, 'sampling/importance_sampling_ratio/mean': 0.9998997449874878, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8439916148781776, 'clip_ratio/low_mean': 4.28195745598714e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.581290911824908e-06, 'clip_ratio/high_max': 1.4325163647299632e-05, 'clip_ratio/region_mean': 4.6400865016948956e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 300/1024 [13:12:16<33:19:05, 165.67s/it][AINFO 12-02 13:25:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:25:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:25:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:25:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 301/1024 [13:14:43<32:08:52, 160.07s/it][A
+                                                         [A{'loss': 0.0853, 'grad_norm': 0.0017261393368244171, 'learning_rate': 1e-05, 'num_tokens': 244515378.0, 'completions/mean_length': 6173.5234375, 'completions/min_length': 21.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6093.1259765625, 'completions/min_terminated_length': 21.0, 'completions/max_terminated_length': 15286.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3532412052154541, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019458644092082977, 'sampling/sampling_logp_difference/max': 7.4986090660095215, 'sampling/importance_sampling_ratio/min': 0.000553854217287153, 'sampling/importance_sampling_ratio/mean': 0.9999533891677856, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8975192531943321, 'clip_ratio/low_mean': 5.01860952226707e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.620740583050065e-06, 'clip_ratio/high_max': 1.848296233220026e-05, 'clip_ratio/region_mean': 5.480683557834709e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 301/1024 [13:14:43<32:08:52, 160.07s/it][AINFO 12-02 13:28:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:28:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:28:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:28:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 302/1024 [13:17:29<32:26:56, 161.80s/it][A
+                                                         [A{'loss': 0.0731, 'grad_norm': 0.0018768958980217576, 'learning_rate': 1e-05, 'num_tokens': 245258318.0, 'completions/mean_length': 5640.90625, 'completions/min_length': 359.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5470.38134765625, 'completions/min_terminated_length': 359.0, 'completions/max_terminated_length': 16032.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.3135277330875397, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019646335393190384, 'sampling/sampling_logp_difference/max': 4.936601638793945, 'sampling/importance_sampling_ratio/min': 0.0071789538487792015, 'sampling/importance_sampling_ratio/mean': 0.9999516606330872, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8833519890904427, 'clip_ratio/low_mean': 3.397437080820964e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2276760230633954e-05, 'clip_ratio/high_max': 4.114005332667148e-05, 'clip_ratio/region_mean': 4.625113024303573e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 302/1024 [13:17:29<32:26:56, 161.80s/it][AINFO 12-02 13:31:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:31:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:31:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:31:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|██▉       | 303/1024 [13:20:23<33:08:22, 165.47s/it][A
+                                                         [A{'loss': 0.0298, 'grad_norm': 0.002617602702230215, 'learning_rate': 1e-05, 'num_tokens': 246280663.0, 'completions/mean_length': 7840.5078125, 'completions/min_length': 758.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7564.9111328125, 'completions/min_terminated_length': 758.0, 'completions/max_terminated_length': 16378.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.29826050996780396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022171074524521828, 'sampling/sampling_logp_difference/max': 7.015084266662598, 'sampling/importance_sampling_ratio/min': 0.0008982301224023104, 'sampling/importance_sampling_ratio/mean': 0.9999324083328247, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9772802665829659, 'clip_ratio/low_mean': 4.4677519781544106e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.514302474944998e-06, 'clip_ratio/high_max': 1.4196921938491869e-05, 'clip_ratio/region_mean': 4.919182129015098e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 303/1024 [13:20:23<33:08:22, 165.47s/it][AINFO 12-02 13:33:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:33:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:33:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:33:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|██▉       | 304/1024 [13:23:02<32:42:40, 163.56s/it][A
+                                                         [A{'loss': 0.0513, 'grad_norm': 0.0011874900665134192, 'learning_rate': 1e-05, 'num_tokens': 247107604.0, 'completions/mean_length': 6316.1015625, 'completions/min_length': 779.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6074.47216796875, 'completions/min_terminated_length': 779.0, 'completions/max_terminated_length': 16326.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2227931022644043, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018691308796405792, 'sampling/sampling_logp_difference/max': 8.749991416931152, 'sampling/importance_sampling_ratio/min': 0.00015846268797758967, 'sampling/importance_sampling_ratio/mean': 1.0000126361846924, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8542795851826668, 'clip_ratio/low_mean': 1.7621316146687604e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.7621316146687604e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 304/1024 [13:23:02<32:42:40, 163.56s/it][AINFO 12-02 13:36:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:36:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:36:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:36:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|██▉       | 305/1024 [13:25:38<32:14:29, 161.43s/it][A
+                                                         [A{'loss': 0.0725, 'grad_norm': 0.002459619427099824, 'learning_rate': 1e-05, 'num_tokens': 247967322.0, 'completions/mean_length': 6568.171875, 'completions/min_length': 97.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6412.365234375, 'completions/min_terminated_length': 97.0, 'completions/max_terminated_length': 15782.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3214184641838074, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020134467631578445, 'sampling/sampling_logp_difference/max': 4.394028663635254, 'sampling/importance_sampling_ratio/min': 0.012350871227681637, 'sampling/importance_sampling_ratio/mean': 0.9998743534088135, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9063890501856804, 'clip_ratio/low_mean': 6.0967123090449604e-05, 'clip_ratio/low_min': 2.711407751121442e-05, 'clip_ratio/high_mean': 7.739938041595451e-07, 'clip_ratio/high_max': 3.0959752166381804e-06, 'clip_ratio/region_mean': 6.17411176335736e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 305/1024 [13:25:38<32:14:29, 161.43s/it][AINFO 12-02 13:39:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:39:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:39:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:39:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|██▉       | 306/1024 [13:28:42<33:31:25, 168.09s/it][A
+                                                         [A{'loss': 0.0244, 'grad_norm': 0.0016673406353220344, 'learning_rate': 1e-05, 'num_tokens': 249031710.0, 'completions/mean_length': 8182.28125, 'completions/min_length': 877.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7635.50048828125, 'completions/min_terminated_length': 877.0, 'completions/max_terminated_length': 15714.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.22225631773471832, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021704845130443573, 'sampling/sampling_logp_difference/max': 7.1247758865356445, 'sampling/importance_sampling_ratio/min': 0.0008049134048633277, 'sampling/importance_sampling_ratio/mean': 0.9998995065689087, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0137704983353615, 'clip_ratio/low_mean': 2.400908408617397e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4876959539833479e-06, 'clip_ratio/high_max': 5.9507838159333915e-06, 'clip_ratio/region_mean': 2.549678004015732e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 306/1024 [13:28:42<33:31:25, 168.09s/it][AINFO 12-02 13:42:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:42:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:42:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:42:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|██▉       | 307/1024 [13:31:02<31:47:42, 159.64s/it][A
+                                                         [A{'loss': 0.0171, 'grad_norm': 0.0022682021372020245, 'learning_rate': 1e-05, 'num_tokens': 249881047.0, 'completions/mean_length': 6460.5703125, 'completions/min_length': 804.0, 'completions/max_length': 14502.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6460.5703125, 'completions/min_terminated_length': 804.0, 'completions/max_terminated_length': 14502.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.25566887855529785, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02204791083931923, 'sampling/sampling_logp_difference/max': 5.874861240386963, 'sampling/importance_sampling_ratio/min': 0.002809183904901147, 'sampling/importance_sampling_ratio/mean': 0.9999744296073914, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0418165400624275, 'clip_ratio/low_mean': 3.1829216595724574e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.6318006095825695e-06, 'clip_ratio/high_max': 1.4527202438330278e-05, 'clip_ratio/region_mean': 3.5461017205307144e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 307/1024 [13:31:02<31:47:42, 159.64s/it][AINFO 12-02 13:44:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:44:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:44:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:44:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|███       | 308/1024 [13:33:55<32:32:48, 163.64s/it][A
+                                                         [A{'loss': 0.0637, 'grad_norm': 0.002436346374452114, 'learning_rate': 1e-05, 'num_tokens': 250773806.0, 'completions/mean_length': 6846.3046875, 'completions/min_length': 944.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6694.9130859375, 'completions/min_terminated_length': 944.0, 'completions/max_terminated_length': 14754.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.34299150109291077, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02135510742664337, 'sampling/sampling_logp_difference/max': 3.6596758365631104, 'sampling/importance_sampling_ratio/min': 0.0257408544421196, 'sampling/importance_sampling_ratio/mean': 0.999980628490448, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9839218333363533, 'clip_ratio/low_mean': 4.836107154915226e-05, 'clip_ratio/low_min': 3.4611657611094415e-06, 'clip_ratio/high_mean': 4.125313353142701e-06, 'clip_ratio/high_max': 9.222687367582694e-06, 'clip_ratio/region_mean': 5.248638444754761e-05, 'epoch': 0.28}
+
+ 30%|███       | 308/1024 [13:33:55<32:32:48, 163.64s/it][AINFO 12-02 13:47:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:47:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:47:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:47:28 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 13:49:10,403 - math_verify.grader - WARNING - Timeout during comparison
+
+ 30%|███       | 309/1024 [13:36:39<32:31:51, 163.79s/it][A
+                                                         [A{'loss': 0.0791, 'grad_norm': 0.002015948062762618, 'learning_rate': 1e-05, 'num_tokens': 251633074.0, 'completions/mean_length': 6558.53125, 'completions/min_length': 884.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6241.58056640625, 'completions/min_terminated_length': 884.0, 'completions/max_terminated_length': 15777.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017078280448913574, 'sampling/sampling_logp_difference/max': 9.874995231628418, 'sampling/importance_sampling_ratio/min': 5.1445105782477185e-05, 'sampling/importance_sampling_ratio/mean': 0.9999434947967529, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7833076938986778, 'clip_ratio/low_mean': 3.791964286392613e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3319053045488545e-06, 'clip_ratio/high_max': 1.3327621218195418e-05, 'clip_ratio/region_mean': 4.1251548054788145e-05, 'epoch': 0.28}
+
+ 30%|███       | 309/1024 [13:36:39<32:31:51, 163.79s/it][AINFO 12-02 13:50:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:50:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:50:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:50:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|███       | 310/1024 [13:39:34<33:10:47, 167.29s/it][A
+                                                         [A{'loss': 0.042, 'grad_norm': 0.001098336186259985, 'learning_rate': 1e-05, 'num_tokens': 252629300.0, 'completions/mean_length': 7626.390625, 'completions/min_length': 1400.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7487.38134765625, 'completions/min_terminated_length': 1400.0, 'completions/max_terminated_length': 16162.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01972624473273754, 'sampling/sampling_logp_difference/max': 8.438233375549316, 'sampling/importance_sampling_ratio/min': 0.00021643216314259917, 'sampling/importance_sampling_ratio/mean': 1.0000107288360596, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8946382254362106, 'clip_ratio/low_mean': 3.3865982686620555e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.3865982686620555e-05, 'epoch': 0.29}
+
+ 30%|███       | 310/1024 [13:39:35<33:10:47, 167.29s/it][AINFO 12-02 13:53:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:53:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:53:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:53:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|███       | 311/1024 [13:42:00<31:49:10, 160.66s/it][A
+                                                         [A{'loss': 0.0408, 'grad_norm': 0.003871417138725519, 'learning_rate': 1e-05, 'num_tokens': 253389562.0, 'completions/mean_length': 5796.984375, 'completions/min_length': 528.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5713.6220703125, 'completions/min_terminated_length': 528.0, 'completions/max_terminated_length': 15763.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.23752351105213165, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019220752641558647, 'sampling/sampling_logp_difference/max': 10.624960899353027, 'sampling/importance_sampling_ratio/min': 2.4301782104885206e-05, 'sampling/importance_sampling_ratio/mean': 0.9998880624771118, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.969724528491497, 'clip_ratio/low_mean': 1.7658890669736138e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6444445805063879e-06, 'clip_ratio/high_max': 6.5777783220255515e-06, 'clip_ratio/region_mean': 1.9303335250242526e-05, 'epoch': 0.29}
+
+ 30%|███       | 311/1024 [13:42:00<31:49:10, 160.66s/it][AINFO 12-02 13:55:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:55:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:55:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:55:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|███       | 312/1024 [13:45:03<33:07:44, 167.51s/it][A
+                                                         [A{'loss': -0.0089, 'grad_norm': 0.002257548039779067, 'learning_rate': 1e-05, 'num_tokens': 254295858.0, 'completions/mean_length': 6938.5625, 'completions/min_length': 769.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6788.63525390625, 'completions/min_terminated_length': 769.0, 'completions/max_terminated_length': 15769.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2596206068992615, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02080199122428894, 'sampling/sampling_logp_difference/max': 6.970874309539795, 'sampling/importance_sampling_ratio/min': 0.0009388317703269422, 'sampling/importance_sampling_ratio/mean': 1.0000464916229248, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9812447279691696, 'clip_ratio/low_mean': 3.2033483023496956e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8300572125772305e-06, 'clip_ratio/high_max': 8.099077376755304e-06, 'clip_ratio/region_mean': 3.486354006554393e-05, 'epoch': 0.29}
+
+ 30%|███       | 312/1024 [13:45:03<33:07:44, 167.51s/it][AINFO 12-02 13:58:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:58:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:58:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:58:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 313/1024 [13:47:49<32:58:55, 167.00s/it][A
+                                                         [A{'loss': 0.0179, 'grad_norm': 0.00420041661709547, 'learning_rate': 1e-05, 'num_tokens': 255197110.0, 'completions/mean_length': 6878.15625, 'completions/min_length': 302.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6650.01611328125, 'completions/min_terminated_length': 302.0, 'completions/max_terminated_length': 15163.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.30433881282806396, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02016574889421463, 'sampling/sampling_logp_difference/max': 4.185338020324707, 'sampling/importance_sampling_ratio/min': 0.015217061154544353, 'sampling/importance_sampling_ratio/mean': 0.9999925494194031, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9106859937310219, 'clip_ratio/low_mean': 3.414505465570983e-05, 'clip_ratio/low_min': 3.790060873143375e-06, 'clip_ratio/high_mean': 1.1104793884442188e-06, 'clip_ratio/high_max': 4.441917553776875e-06, 'clip_ratio/region_mean': 3.5255534044154047e-05, 'epoch': 0.29}
+
+ 31%|███       | 313/1024 [13:47:49<32:58:55, 167.00s/it][AINFO 12-02 14:01:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:01:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:01:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:01:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 314/1024 [13:50:15<31:40:43, 160.62s/it][A
+                                                         [A{'loss': 0.0302, 'grad_norm': 0.0027907798066735268, 'learning_rate': 1e-05, 'num_tokens': 256018935.0, 'completions/mean_length': 6260.2578125, 'completions/min_length': 790.0, 'completions/max_length': 14462.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6260.2578125, 'completions/min_terminated_length': 790.0, 'completions/max_terminated_length': 14462.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.2659186124801636, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0191945917904377, 'sampling/sampling_logp_difference/max': 9.499998092651367, 'sampling/importance_sampling_ratio/min': 7.485197420464829e-05, 'sampling/importance_sampling_ratio/mean': 1.0000364780426025, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9523455575108528, 'clip_ratio/low_mean': 3.137724206681014e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.203687813562283e-06, 'clip_ratio/high_max': 8.814751254249131e-06, 'clip_ratio/region_mean': 3.3580929766685585e-05, 'epoch': 0.29}
+
+ 31%|███       | 314/1024 [13:50:15<31:40:43, 160.62s/it][AINFO 12-02 14:03:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:03:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:03:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:03:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 315/1024 [13:52:46<31:03:34, 157.71s/it][A
+                                                         [A{'loss': 0.0379, 'grad_norm': 0.0023744129575788975, 'learning_rate': 1e-05, 'num_tokens': 256841129.0, 'completions/mean_length': 6202.828125, 'completions/min_length': 453.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6041.22265625, 'completions/min_terminated_length': 453.0, 'completions/max_terminated_length': 16124.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.32407689094543457, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019519174471497536, 'sampling/sampling_logp_difference/max': 11.588750839233398, 'sampling/importance_sampling_ratio/min': 9.269781003240496e-06, 'sampling/importance_sampling_ratio/mean': 1.0000146627426147, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8513326346874237, 'clip_ratio/low_mean': 2.780131131885355e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.171315814957779e-06, 'clip_ratio/high_max': 2.8685263259831117e-05, 'clip_ratio/region_mean': 3.497262770224552e-05, 'epoch': 0.29}
+
+ 31%|███       | 315/1024 [13:52:46<31:03:34, 157.71s/it][AINFO 12-02 14:06:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:06:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:06:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:06:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 316/1024 [13:56:00<33:09:29, 168.60s/it][A
+                                                         [A{'loss': 0.0433, 'grad_norm': 0.0024705040268599987, 'learning_rate': 1e-05, 'num_tokens': 257884188.0, 'completions/mean_length': 8019.4609375, 'completions/min_length': 262.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 7073.90380859375, 'completions/min_terminated_length': 262.0, 'completions/max_terminated_length': 16328.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01984308287501335, 'sampling/sampling_logp_difference/max': 4.113009452819824, 'sampling/importance_sampling_ratio/min': 0.016358470544219017, 'sampling/importance_sampling_ratio/mean': 0.9999315738677979, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9211000874638557, 'clip_ratio/low_mean': 3.0394592840821133e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.095424401384662e-06, 'clip_ratio/high_max': 1.6381697605538648e-05, 'clip_ratio/region_mean': 3.449001792432682e-05, 'epoch': 0.29}
+
+ 31%|███       | 316/1024 [13:56:00<33:09:29, 168.60s/it][AINFO 12-02 14:09:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:09:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:09:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:09:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 317/1024 [13:59:07<34:14:35, 174.36s/it][A
+                                                         [A{'loss': 0.0305, 'grad_norm': 0.0032848953269422054, 'learning_rate': 1e-05, 'num_tokens': 258831852.0, 'completions/mean_length': 7257.6875, 'completions/min_length': 248.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7038.65625, 'completions/min_terminated_length': 248.0, 'completions/max_terminated_length': 15210.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019743187353014946, 'sampling/sampling_logp_difference/max': 8.524773597717285, 'sampling/importance_sampling_ratio/min': 0.00019848966621793807, 'sampling/importance_sampling_ratio/mean': 0.9998986124992371, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8801277950406075, 'clip_ratio/low_mean': 3.025547425750119e-05, 'clip_ratio/low_min': 2.697337095014518e-06, 'clip_ratio/high_mean': 1.871350605142652e-06, 'clip_ratio/high_max': 7.485402420570608e-06, 'clip_ratio/region_mean': 3.212682509001752e-05, 'epoch': 0.29}
+
+ 31%|███       | 317/1024 [13:59:07<34:14:35, 174.36s/it][AINFO 12-02 14:12:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:12:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:12:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:12:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 318/1024 [14:01:43<33:04:34, 168.66s/it][A
+                                                         [A{'loss': 0.099, 'grad_norm': 0.004201764706522226, 'learning_rate': 1e-05, 'num_tokens': 259623512.0, 'completions/mean_length': 6042.84375, 'completions/min_length': 251.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5878.69873046875, 'completions/min_terminated_length': 251.0, 'completions/max_terminated_length': 14903.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.640625, 'reward_std': 0.3913668990135193, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018901977688074112, 'sampling/sampling_logp_difference/max': 8.187467575073242, 'sampling/importance_sampling_ratio/min': 0.00027811730979010463, 'sampling/importance_sampling_ratio/mean': 0.9998612403869629, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8792382404208183, 'clip_ratio/low_mean': 3.605492440783564e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.8192758893274e-06, 'clip_ratio/high_max': 1.52771035573096e-05, 'clip_ratio/region_mean': 3.987420052453672e-05, 'epoch': 0.29}
+
+ 31%|███       | 318/1024 [14:01:43<33:04:34, 168.66s/it][AINFO 12-02 14:15:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:15:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:15:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:15:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 319/1024 [14:04:41<33:35:09, 171.50s/it][A
+                                                         [A{'loss': 0.0383, 'grad_norm': 0.0014557713875547051, 'learning_rate': 1e-05, 'num_tokens': 260623928.0, 'completions/mean_length': 7667.6875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7458.49658203125, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16381.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.22726887464523315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01979639381170273, 'sampling/sampling_logp_difference/max': 8.248785018920898, 'sampling/importance_sampling_ratio/min': 0.0002615761768538505, 'sampling/importance_sampling_ratio/mean': 0.9999264478683472, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9096411988139153, 'clip_ratio/low_mean': 4.1642084397608414e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.1642084397608414e-05, 'epoch': 0.29}
+
+ 31%|███       | 319/1024 [14:04:41<33:35:09, 171.50s/it][AINFO 12-02 14:18:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:18:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:18:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:18:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███▏      | 320/1024 [14:07:18<32:42:09, 167.23s/it][A
+                                                         [A{'loss': 0.0589, 'grad_norm': 0.0028210312593728304, 'learning_rate': 1e-05, 'num_tokens': 261465625.0, 'completions/mean_length': 6428.8203125, 'completions/min_length': 617.0, 'completions/max_length': 15514.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6428.8203125, 'completions/min_terminated_length': 617.0, 'completions/max_terminated_length': 15514.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3169426918029785, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021066997200250626, 'sampling/sampling_logp_difference/max': 6.704597473144531, 'sampling/importance_sampling_ratio/min': 0.001225265790708363, 'sampling/importance_sampling_ratio/mean': 1.0000195503234863, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9974069148302078, 'clip_ratio/low_mean': 2.704614530557592e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.90047670812055e-06, 'clip_ratio/high_max': 2.36019068324822e-05, 'clip_ratio/region_mean': 3.2946622809504333e-05, 'epoch': 0.29}
+
+ 31%|███▏      | 320/1024 [14:07:18<32:42:09, 167.23s/it][AINFO 12-02 14:20:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:20:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:20:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:20:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███▏      | 321/1024 [14:09:46<31:31:59, 161.48s/it][A
+                                                         [A{'loss': 0.0357, 'grad_norm': 0.0024263609666377306, 'learning_rate': 1e-05, 'num_tokens': 262208475.0, 'completions/mean_length': 5664.515625, 'completions/min_length': 299.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5580.1103515625, 'completions/min_terminated_length': 299.0, 'completions/max_terminated_length': 15224.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.26409637928009033, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01997508481144905, 'sampling/sampling_logp_difference/max': 9.152630805969238, 'sampling/importance_sampling_ratio/min': 0.0001059407222783193, 'sampling/importance_sampling_ratio/mean': 0.9998950958251953, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9557281509041786, 'clip_ratio/low_mean': 3.7066520235384814e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.408613214465731e-06, 'clip_ratio/high_max': 2.9634452857862925e-05, 'clip_ratio/region_mean': 4.447513333616371e-05, 'epoch': 0.3}
+
+ 31%|███▏      | 321/1024 [14:09:46<31:31:59, 161.48s/it][AINFO 12-02 14:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:23:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███▏      | 322/1024 [14:12:46<32:32:15, 166.86s/it][A
+                                                         [A{'loss': 0.0366, 'grad_norm': 0.0025556792970746756, 'learning_rate': 1e-05, 'num_tokens': 263078672.0, 'completions/mean_length': 6649.5390625, 'completions/min_length': 599.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6170.794921875, 'completions/min_terminated_length': 599.0, 'completions/max_terminated_length': 15566.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3214184641838074, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019613387063145638, 'sampling/sampling_logp_difference/max': 10.223334312438965, 'sampling/importance_sampling_ratio/min': 3.631301660789177e-05, 'sampling/importance_sampling_ratio/mean': 0.9998431205749512, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9003193452954292, 'clip_ratio/low_mean': 4.980480150607036e-05, 'clip_ratio/low_min': 5.136423624207964e-06, 'clip_ratio/high_mean': 5.685056066795369e-06, 'clip_ratio/high_max': 1.9527269159880234e-05, 'clip_ratio/region_mean': 5.5489856435997353e-05, 'epoch': 0.3}
+
+ 31%|███▏      | 322/1024 [14:12:46<32:32:15, 166.86s/it][AINFO 12-02 14:26:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:26:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:26:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:26:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 323/1024 [14:15:19<31:41:05, 162.72s/it][A
+                                                         [A{'loss': 0.077, 'grad_norm': 0.0008845282136462629, 'learning_rate': 1e-05, 'num_tokens': 263843797.0, 'completions/mean_length': 5819.4140625, 'completions/min_length': 701.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5478.62060546875, 'completions/min_terminated_length': 701.0, 'completions/max_terminated_length': 15190.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.14913026988506317, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02007308602333069, 'sampling/sampling_logp_difference/max': 2.6942083835601807, 'sampling/importance_sampling_ratio/min': 0.06759586930274963, 'sampling/importance_sampling_ratio/mean': 0.9999452233314514, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9234923645853996, 'clip_ratio/low_mean': 1.1492368912513484e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.1492368912513484e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 323/1024 [14:15:19<31:41:05, 162.72s/it][AINFO 12-02 14:28:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:28:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:28:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:28:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 324/1024 [14:18:25<33:00:57, 169.80s/it][A
+                                                         [A{'loss': 0.0633, 'grad_norm': 0.001527746208012104, 'learning_rate': 1e-05, 'num_tokens': 264751769.0, 'completions/mean_length': 6952.96875, 'completions/min_length': 506.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6726.62451171875, 'completions/min_terminated_length': 506.0, 'completions/max_terminated_length': 16033.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.23410367965698242, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019622590392827988, 'sampling/sampling_logp_difference/max': 7.590427875518799, 'sampling/importance_sampling_ratio/min': 0.000505264790263027, 'sampling/importance_sampling_ratio/mean': 0.9999520778656006, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8909401148557663, 'clip_ratio/low_mean': 2.420720869622528e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9219773409749905e-06, 'clip_ratio/high_max': 1.1687909363899962e-05, 'clip_ratio/region_mean': 2.7129186207730527e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 324/1024 [14:18:25<33:00:57, 169.80s/it][AINFO 12-02 14:31:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:31:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:31:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:31:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 325/1024 [14:21:25<33:33:23, 172.82s/it][A
+                                                         [A{'loss': 0.0771, 'grad_norm': 0.003113618353381753, 'learning_rate': 1e-05, 'num_tokens': 265810580.0, 'completions/mean_length': 8138.5234375, 'completions/min_length': 268.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7733.0078125, 'completions/min_terminated_length': 268.0, 'completions/max_terminated_length': 15667.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02168515883386135, 'sampling/sampling_logp_difference/max': 16.189363479614258, 'sampling/importance_sampling_ratio/min': 9.312124404914357e-08, 'sampling/importance_sampling_ratio/mean': 0.9998828172683716, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.972789965569973, 'clip_ratio/low_mean': 5.333864191925386e-05, 'clip_ratio/low_min': 5.043169494456379e-06, 'clip_ratio/high_mean': 4.600909505825257e-06, 'clip_ratio/high_max': 1.5079081094881985e-05, 'clip_ratio/region_mean': 5.793955187982647e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 325/1024 [14:21:25<33:33:23, 172.82s/it][AINFO 12-02 14:34:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:34:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:34:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:34:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 326/1024 [14:24:30<34:12:06, 176.40s/it][A
+                                                         [A{'loss': 0.0509, 'grad_norm': 0.0009589543915353715, 'learning_rate': 1e-05, 'num_tokens': 266796097.0, 'completions/mean_length': 7565.6015625, 'completions/min_length': 1017.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7131.90966796875, 'completions/min_terminated_length': 1017.0, 'completions/max_terminated_length': 16248.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.16834920644760132, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01885361596941948, 'sampling/sampling_logp_difference/max': 6.374822616577148, 'sampling/importance_sampling_ratio/min': 0.0017039220547303557, 'sampling/importance_sampling_ratio/mean': 0.9999743700027466, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.835600845515728, 'clip_ratio/low_mean': 3.45970811395091e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1159518180647865e-06, 'clip_ratio/high_max': 4.463807272259146e-06, 'clip_ratio/region_mean': 3.571303295757389e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 326/1024 [14:24:30<34:12:06, 176.40s/it][AINFO 12-02 14:38:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:38:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:38:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:38:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 327/1024 [14:27:27<34:13:46, 176.80s/it][A
+                                                         [A{'loss': 0.0514, 'grad_norm': 0.0024741124361753464, 'learning_rate': 1e-05, 'num_tokens': 267727528.0, 'completions/mean_length': 7124.0546875, 'completions/min_length': 428.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6668.64697265625, 'completions/min_terminated_length': 428.0, 'completions/max_terminated_length': 16167.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.23592591285705566, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01998118683695793, 'sampling/sampling_logp_difference/max': 9.980022430419922, 'sampling/importance_sampling_ratio/min': 4.63160322397016e-05, 'sampling/importance_sampling_ratio/mean': 0.999866247177124, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9041655585169792, 'clip_ratio/low_mean': 5.806843591926736e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.651615083479555e-06, 'clip_ratio/high_max': 2.260646033391822e-05, 'clip_ratio/region_mean': 6.372005145749426e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 327/1024 [14:27:27<34:13:46, 176.80s/it][AINFO 12-02 14:41:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:41:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:41:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:41:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 328/1024 [14:30:16<33:40:48, 174.21s/it][A
+                                                         [A{'loss': 0.0147, 'grad_norm': 0.002233455190435052, 'learning_rate': 1e-05, 'num_tokens': 268610868.0, 'completions/mean_length': 6757.65625, 'completions/min_length': 1123.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6681.8583984375, 'completions/min_terminated_length': 1123.0, 'completions/max_terminated_length': 14834.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.23857943713665009, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021600255742669106, 'sampling/sampling_logp_difference/max': 12.616476058959961, 'sampling/importance_sampling_ratio/min': 3.3169128528243164e-06, 'sampling/importance_sampling_ratio/mean': 0.9999549984931946, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.105302907526493, 'clip_ratio/low_mean': 3.374219397755951e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.456775966194982e-06, 'clip_ratio/high_max': 1.7461054540035548e-05, 'clip_ratio/region_mean': 3.919897017112817e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 328/1024 [14:30:16<33:40:48, 174.21s/it][AINFO 12-02 14:43:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:43:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:43:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:43:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 329/1024 [14:33:14<33:51:30, 175.38s/it][A
+                                                         [A{'loss': 0.0845, 'grad_norm': 0.0017964976141229272, 'learning_rate': 1e-05, 'num_tokens': 269594867.0, 'completions/mean_length': 7522.5546875, 'completions/min_length': 1390.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7381.8974609375, 'completions/min_terminated_length': 1390.0, 'completions/max_terminated_length': 16147.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.28223684430122375, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021326296031475067, 'sampling/sampling_logp_difference/max': 6.098084449768066, 'sampling/importance_sampling_ratio/min': 0.002247168216854334, 'sampling/importance_sampling_ratio/mean': 0.999937891960144, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0577925741672516, 'clip_ratio/low_mean': 4.300070588669769e-05, 'clip_ratio/low_min': 3.6705330330732977e-06, 'clip_ratio/high_mean': 4.378551011541276e-06, 'clip_ratio/high_max': 1.7514204046165105e-05, 'clip_ratio/region_mean': 4.7379256784552126e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 329/1024 [14:33:14<33:51:30, 175.38s/it][AINFO 12-02 14:46:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:46:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:46:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:46:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 330/1024 [14:35:41<32:09:55, 166.85s/it][A
+                                                         [A{'loss': 0.0291, 'grad_norm': 0.0023369218688458204, 'learning_rate': 1e-05, 'num_tokens': 270410785.0, 'completions/mean_length': 6232.109375, 'completions/min_length': 1238.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5904.62890625, 'completions/min_terminated_length': 1238.0, 'completions/max_terminated_length': 14462.0, 'rewards/accuracy_reward/mean': 0.6015625, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.6015625, 'reward_std': 0.23516449332237244, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018453873693943024, 'sampling/sampling_logp_difference/max': 9.154382705688477, 'sampling/importance_sampling_ratio/min': 0.00010575528722256422, 'sampling/importance_sampling_ratio/mean': 1.000063180923462, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8473618850111961, 'clip_ratio/low_mean': 2.5991578013417893e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.252754024491878e-06, 'clip_ratio/high_max': 1.7011016097967513e-05, 'clip_ratio/region_mean': 3.0244332265283447e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 330/1024 [14:35:41<32:09:55, 166.85s/it][AINFO 12-02 14:49:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:49:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:49:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:49:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 331/1024 [14:38:02<30:39:47, 159.29s/it][A
+                                                         [A{'loss': 0.0672, 'grad_norm': 0.0014558705734089017, 'learning_rate': 1e-05, 'num_tokens': 271179113.0, 'completions/mean_length': 5847.0625, 'completions/min_length': 27.0, 'completions/max_length': 14454.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5847.0625, 'completions/min_terminated_length': 27.0, 'completions/max_terminated_length': 14454.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.22673210501670837, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.017629161477088928, 'sampling/sampling_logp_difference/max': 10.822555541992188, 'sampling/importance_sampling_ratio/min': 1.994453305087518e-05, 'sampling/importance_sampling_ratio/mean': 1.000067114830017, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8186105340719223, 'clip_ratio/low_mean': 3.064826853460545e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.300789669287042e-06, 'clip_ratio/high_max': 1.2072427125531249e-05, 'clip_ratio/region_mean': 3.4949058090205654e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 331/1024 [14:38:02<30:39:47, 159.29s/it][AINFO 12-02 14:51:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:51:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:51:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:51:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 332/1024 [14:40:50<31:05:50, 161.78s/it][A
+                                                         [A{'loss': 0.051, 'grad_norm': 0.002496426459401846, 'learning_rate': 1e-05, 'num_tokens': 272054510.0, 'completions/mean_length': 6670.6015625, 'completions/min_length': 24.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6516.4208984375, 'completions/min_terminated_length': 24.0, 'completions/max_terminated_length': 16065.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.29932624101638794, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020175442099571228, 'sampling/sampling_logp_difference/max': 9.124674797058105, 'sampling/importance_sampling_ratio/min': 0.00010894420120166615, 'sampling/importance_sampling_ratio/mean': 0.9998705387115479, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9379853457212448, 'clip_ratio/low_mean': 5.131868192620459e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.719567063053546e-06, 'clip_ratio/high_max': 3.204624090358266e-05, 'clip_ratio/region_mean': 6.0038249102944974e-05, 'epoch': 0.31}
+
+ 32%|███▏      | 332/1024 [14:40:50<31:05:50, 161.78s/it][AINFO 12-02 14:54:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:54:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:54:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:54:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 333/1024 [14:43:13<29:57:27, 156.08s/it][A
+                                                         [A{'loss': 0.0257, 'grad_norm': 0.0028466631192713976, 'learning_rate': 1e-05, 'num_tokens': 272920304.0, 'completions/mean_length': 6613.328125, 'completions/min_length': 439.0, 'completions/max_length': 15716.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6613.328125, 'completions/min_terminated_length': 439.0, 'completions/max_terminated_length': 15716.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.32089439034461975, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02250460349023342, 'sampling/sampling_logp_difference/max': 3.511396884918213, 'sampling/importance_sampling_ratio/min': 0.02985518053174019, 'sampling/importance_sampling_ratio/mean': 0.9999476671218872, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0781218782067299, 'clip_ratio/low_mean': 6.672416202491149e-05, 'clip_ratio/low_min': 4.344501576269977e-06, 'clip_ratio/high_mean': 2.827989874276682e-06, 'clip_ratio/high_max': 1.1311959497106727e-05, 'clip_ratio/region_mean': 6.955215212656185e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 333/1024 [14:43:13<29:57:27, 156.08s/it][AINFO 12-02 14:56:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:56:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:56:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:56:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 334/1024 [14:46:20<31:41:07, 165.32s/it][A
+                                                         [A{'loss': 0.0564, 'grad_norm': 0.0027409526519477367, 'learning_rate': 1e-05, 'num_tokens': 273789588.0, 'completions/mean_length': 6632.78125, 'completions/min_length': 888.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6318.2255859375, 'completions/min_terminated_length': 888.0, 'completions/max_terminated_length': 15572.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.12863078713417053, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02000725269317627, 'sampling/sampling_logp_difference/max': 9.374701499938965, 'sampling/importance_sampling_ratio/min': 8.484355930704623e-05, 'sampling/importance_sampling_ratio/mean': 0.9999409914016724, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9595735669136047, 'clip_ratio/low_mean': 2.3429964585375274e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.3429964585375274e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 334/1024 [14:46:20<31:41:07, 165.32s/it][AINFO 12-02 14:59:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:59:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:59:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:59:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 335/1024 [14:49:20<32:30:54, 169.89s/it][A
+                                                         [A{'loss': 0.0521, 'grad_norm': 0.001853835303336382, 'learning_rate': 1e-05, 'num_tokens': 274843754.0, 'completions/mean_length': 8083.421875, 'completions/min_length': 631.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7884.20849609375, 'completions/min_terminated_length': 631.0, 'completions/max_terminated_length': 15567.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.29719969630241394, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.023757295683026314, 'sampling/sampling_logp_difference/max': 12.007329940795898, 'sampling/importance_sampling_ratio/min': 6.099340225773631e-06, 'sampling/importance_sampling_ratio/mean': 0.999961256980896, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.139024168252945, 'clip_ratio/low_mean': 6.270217818382662e-05, 'clip_ratio/low_min': 1.282997527596308e-05, 'clip_ratio/high_mean': 2.6212845796180773e-06, 'clip_ratio/high_max': 1.0485138318472309e-05, 'clip_ratio/region_mean': 6.532346287713153e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 335/1024 [14:49:20<32:30:54, 169.89s/it][AINFO 12-02 15:02:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:02:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:02:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:02:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 336/1024 [14:52:04<32:07:13, 168.07s/it][A
+                                                         [A{'loss': 0.0179, 'grad_norm': 0.002049664966762066, 'learning_rate': 1e-05, 'num_tokens': 275750023.0, 'completions/mean_length': 6932.6640625, 'completions/min_length': 666.0, 'completions/max_length': 16058.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6932.6640625, 'completions/min_terminated_length': 666.0, 'completions/max_terminated_length': 16058.0, 'rewards/accuracy_reward/mean': 0.21875, 'rewards/accuracy_reward/std': 0.41502299904823303, 'reward': 0.21875, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021840902045369148, 'sampling/sampling_logp_difference/max': 9.847487449645996, 'sampling/importance_sampling_ratio/min': 5.287989188218489e-05, 'sampling/importance_sampling_ratio/mean': 1.0000157356262207, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.2969390451908112, 'clip_ratio/low_mean': 3.2649955073793535e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6395592865592334e-06, 'clip_ratio/high_max': 6.558237146236934e-06, 'clip_ratio/region_mean': 3.428951481510012e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 336/1024 [14:52:04<32:07:13, 168.07s/it][AINFO 12-02 15:05:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:05:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:05:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:05:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 337/1024 [14:55:11<33:11:29, 173.93s/it][A
+                                                         [A{'loss': 0.0487, 'grad_norm': 0.0030745298136025667, 'learning_rate': 1e-05, 'num_tokens': 276750011.0, 'completions/mean_length': 7630.65625, 'completions/min_length': 1002.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7124.26416015625, 'completions/min_terminated_length': 1002.0, 'completions/max_terminated_length': 15213.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.30091896653175354, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021394159644842148, 'sampling/sampling_logp_difference/max': 10.365766525268555, 'sampling/importance_sampling_ratio/min': 3.149233089061454e-05, 'sampling/importance_sampling_ratio/mean': 0.9999728798866272, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.959126852452755, 'clip_ratio/low_mean': 3.607215444390022e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2956589898749371e-06, 'clip_ratio/high_max': 5.1826359594997484e-06, 'clip_ratio/region_mean': 3.736781377483567e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 337/1024 [14:55:11<33:11:29, 173.93s/it][AINFO 12-02 15:08:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:08:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:08:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:08:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 338/1024 [14:58:05<33:06:48, 173.77s/it][A
+                                                         [A{'loss': 0.0686, 'grad_norm': 0.0014476332580670714, 'learning_rate': 1e-05, 'num_tokens': 277715450.0, 'completions/mean_length': 7383.2421875, 'completions/min_length': 432.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7240.37353515625, 'completions/min_terminated_length': 432.0, 'completions/max_terminated_length': 15907.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2477683424949646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020672230049967766, 'sampling/sampling_logp_difference/max': 10.00327205657959, 'sampling/importance_sampling_ratio/min': 4.5251621486386284e-05, 'sampling/importance_sampling_ratio/mean': 0.9999750256538391, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1512386053800583, 'clip_ratio/low_mean': 2.64957521380893e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5604765028219845e-06, 'clip_ratio/high_max': 6.921764679646003e-06, 'clip_ratio/region_mean': 2.905622847038103e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 338/1024 [14:58:05<33:06:48, 173.77s/it][AINFO 12-02 15:11:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:11:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:11:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:11:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 339/1024 [15:00:44<32:12:29, 169.27s/it][A
+                                                         [A{'loss': 0.0067, 'grad_norm': 0.0018895689863711596, 'learning_rate': 1e-05, 'num_tokens': 278491688.0, 'completions/mean_length': 5876.421875, 'completions/min_length': 116.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5793.68505859375, 'completions/min_terminated_length': 116.0, 'completions/max_terminated_length': 14693.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.21146979928016663, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020282316952943802, 'sampling/sampling_logp_difference/max': 7.248683452606201, 'sampling/importance_sampling_ratio/min': 0.0007111100130714476, 'sampling/importance_sampling_ratio/mean': 0.9998824596405029, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0786077454686165, 'clip_ratio/low_mean': 3.8645826748506806e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.255419968212664e-07, 'clip_ratio/high_max': 3.7021679872850655e-06, 'clip_ratio/region_mean': 3.957136880217149e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 339/1024 [15:00:44<32:12:29, 169.27s/it][AINFO 12-02 15:14:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:14:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:14:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:14:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 340/1024 [15:03:32<32:06:29, 168.99s/it][A
+                                                         [A{'loss': 0.0222, 'grad_norm': 0.002133915899321437, 'learning_rate': 1e-05, 'num_tokens': 279427384.0, 'completions/mean_length': 7162.625, 'completions/min_length': 986.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6709.1142578125, 'completions/min_terminated_length': 986.0, 'completions/max_terminated_length': 14627.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.32142335176467896, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019643021747469902, 'sampling/sampling_logp_difference/max': 5.329782009124756, 'sampling/importance_sampling_ratio/min': 0.004845126066356897, 'sampling/importance_sampling_ratio/mean': 1.0000159740447998, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.898807168006897, 'clip_ratio/low_mean': 2.9699310402975243e-05, 'clip_ratio/low_min': 4.435140454006614e-06, 'clip_ratio/high_mean': 4.685133262682939e-06, 'clip_ratio/high_max': 1.8740533050731756e-05, 'clip_ratio/region_mean': 3.4384443438284507e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 340/1024 [15:03:32<32:06:29, 168.99s/it][AINFO 12-02 15:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:17:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 341/1024 [15:06:13<31:35:38, 166.53s/it][A
+                                                         [A{'loss': 0.0866, 'grad_norm': 0.0023624920286238194, 'learning_rate': 1e-05, 'num_tokens': 280352177.0, 'completions/mean_length': 7072.3828125, 'completions/min_length': 93.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6999.06298828125, 'completions/min_terminated_length': 93.0, 'completions/max_terminated_length': 15965.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.36637401580810547, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020037520676851273, 'sampling/sampling_logp_difference/max': 7.100006580352783, 'sampling/importance_sampling_ratio/min': 0.0008250995306298137, 'sampling/importance_sampling_ratio/mean': 0.9999604225158691, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8942967653274536, 'clip_ratio/low_mean': 6.452910844245707e-05, 'clip_ratio/low_min': 9.302988473791629e-06, 'clip_ratio/high_mean': 5.561973125622899e-06, 'clip_ratio/high_max': 1.472241683586617e-05, 'clip_ratio/region_mean': 7.009108327338254e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 341/1024 [15:06:13<31:35:38, 166.53s/it][AINFO 12-02 15:19:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:19:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:19:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:19:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 342/1024 [15:09:07<31:57:58, 168.74s/it][A
+                                                         [A{'loss': 0.0539, 'grad_norm': 0.0025228122249245644, 'learning_rate': 1e-05, 'num_tokens': 281208411.0, 'completions/mean_length': 6553.203125, 'completions/min_length': 664.0, 'completions/max_length': 16300.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6553.203125, 'completions/min_terminated_length': 664.0, 'completions/max_terminated_length': 16300.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3390446603298187, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018958289176225662, 'sampling/sampling_logp_difference/max': 8.108687400817871, 'sampling/importance_sampling_ratio/min': 0.00030091358348727226, 'sampling/importance_sampling_ratio/mean': 0.9999656677246094, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8765531405806541, 'clip_ratio/low_mean': 4.717265596809739e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.717265596809739e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 342/1024 [15:09:07<31:57:58, 168.74s/it][AINFO 12-02 15:22:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:22:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:22:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:22:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 343/1024 [15:11:49<31:33:50, 166.86s/it][A
+                                                         [A{'loss': 0.119, 'grad_norm': 0.003243578365072608, 'learning_rate': 1e-05, 'num_tokens': 282059863.0, 'completions/mean_length': 6489.40625, 'completions/min_length': 374.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6087.1865234375, 'completions/min_terminated_length': 374.0, 'completions/max_terminated_length': 14916.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.39689862728118896, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01820875145494938, 'sampling/sampling_logp_difference/max': 8.233909606933594, 'sampling/importance_sampling_ratio/min': 0.00026549631729722023, 'sampling/importance_sampling_ratio/mean': 0.9999314546585083, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8384068235754967, 'clip_ratio/low_mean': 6.593948137378902e-05, 'clip_ratio/low_min': 1.4238520634535234e-05, 'clip_ratio/high_mean': 3.890525249516941e-06, 'clip_ratio/high_max': 1.5562100998067763e-05, 'clip_ratio/region_mean': 6.983000685067964e-05, 'epoch': 0.32}
+
+ 33%|███▎      | 343/1024 [15:11:49<31:33:50, 166.86s/it][AINFO 12-02 15:25:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:25:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:25:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:25:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▎      | 344/1024 [15:14:38<31:36:59, 167.38s/it][A
+                                                         [A{'loss': 0.0445, 'grad_norm': 0.0027867467142641544, 'learning_rate': 1e-05, 'num_tokens': 282994036.0, 'completions/mean_length': 7148.7890625, 'completions/min_length': 252.0, 'completions/max_length': 15859.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7148.7890625, 'completions/min_terminated_length': 252.0, 'completions/max_terminated_length': 15859.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.28511500358581543, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0217401385307312, 'sampling/sampling_logp_difference/max': 3.583648204803467, 'sampling/importance_sampling_ratio/min': 0.027774186804890633, 'sampling/importance_sampling_ratio/mean': 0.9999145269393921, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0214989855885506, 'clip_ratio/low_mean': 3.0735714062757324e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0285018561262405e-06, 'clip_ratio/high_max': 4.114007424504962e-06, 'clip_ratio/region_mean': 3.176421569150989e-05, 'epoch': 0.32}
+
+ 34%|███▎      | 344/1024 [15:14:38<31:36:59, 167.38s/it][AINFO 12-02 15:28:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:28:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:28:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:28:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▎      | 345/1024 [15:17:02<30:16:26, 160.51s/it][A
+                                                         [A{'loss': 0.0251, 'grad_norm': 0.0024432060308754444, 'learning_rate': 1e-05, 'num_tokens': 283723605.0, 'completions/mean_length': 5532.1328125, 'completions/min_length': 467.0, 'completions/max_length': 16091.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5532.1328125, 'completions/min_terminated_length': 467.0, 'completions/max_terminated_length': 16091.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.38717782497406006, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019281461834907532, 'sampling/sampling_logp_difference/max': 4.428175926208496, 'sampling/importance_sampling_ratio/min': 0.011936242692172527, 'sampling/importance_sampling_ratio/mean': 0.9999819993972778, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9303388148546219, 'clip_ratio/low_mean': 4.230772367463942e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.513276278179546e-06, 'clip_ratio/high_max': 1.6063933799159713e-05, 'clip_ratio/region_mean': 4.782100086231367e-05, 'epoch': 0.32}
+
+ 34%|███▎      | 345/1024 [15:17:02<30:16:26, 160.51s/it][AINFO 12-02 15:30:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 346/1024 [15:19:44<30:17:57, 160.88s/it][A
+                                                         [A{'loss': -0.0037, 'grad_norm': 0.0031446516513824463, 'learning_rate': 1e-05, 'num_tokens': 284617089.0, 'completions/mean_length': 6830.09375, 'completions/min_length': 503.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6441.72314453125, 'completions/min_terminated_length': 503.0, 'completions/max_terminated_length': 15933.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.20911568403244019, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01985902711749077, 'sampling/sampling_logp_difference/max': 7.197423458099365, 'sampling/importance_sampling_ratio/min': 0.0007485119276680052, 'sampling/importance_sampling_ratio/mean': 0.9999873042106628, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9551377296447754, 'clip_ratio/low_mean': 1.5384349637770356e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.554673303871823e-06, 'clip_ratio/high_max': 6.218693215487292e-06, 'clip_ratio/region_mean': 1.6939022600581666e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 346/1024 [15:19:44<30:17:57, 160.88s/it][AINFO 12-02 15:33:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:33:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:33:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:33:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 347/1024 [15:22:34<30:47:05, 163.70s/it][A
+                                                         [A{'loss': 0.0669, 'grad_norm': 0.0024617225863039494, 'learning_rate': 1e-05, 'num_tokens': 285475910.0, 'completions/mean_length': 6557.3515625, 'completions/min_length': 385.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6401.37353515625, 'completions/min_terminated_length': 385.0, 'completions/max_terminated_length': 15613.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2761683464050293, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021138068288564682, 'sampling/sampling_logp_difference/max': 5.079075813293457, 'sampling/importance_sampling_ratio/min': 0.006225659977644682, 'sampling/importance_sampling_ratio/mean': 0.9999537467956543, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0254710763692856, 'clip_ratio/low_mean': 3.287052913947264e-05, 'clip_ratio/low_min': 2.789369091260596e-06, 'clip_ratio/high_mean': 2.8712697712762747e-06, 'clip_ratio/high_max': 7.772906428726856e-06, 'clip_ratio/region_mean': 3.574179936549626e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 347/1024 [15:22:34<30:47:05, 163.70s/it][AINFO 12-02 15:36:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:36:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:36:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:36:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 348/1024 [15:25:36<31:43:42, 168.97s/it][A
+                                                         [A{'loss': 0.0734, 'grad_norm': 0.0017496495274826884, 'learning_rate': 1e-05, 'num_tokens': 286439696.0, 'completions/mean_length': 7379.140625, 'completions/min_length': 1243.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7088.6611328125, 'completions/min_terminated_length': 1243.0, 'completions/max_terminated_length': 15851.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.26538965106010437, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021384600549936295, 'sampling/sampling_logp_difference/max': 5.000324726104736, 'sampling/importance_sampling_ratio/min': 0.006735759321600199, 'sampling/importance_sampling_ratio/mean': 0.9999358654022217, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9518962875008583, 'clip_ratio/low_mean': 6.333507008093875e-05, 'clip_ratio/low_min': 4.415712737682043e-06, 'clip_ratio/high_mean': 3.588538106669148e-06, 'clip_ratio/high_max': 1.0258745533064939e-05, 'clip_ratio/region_mean': 6.692360875604209e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 348/1024 [15:25:36<31:43:42, 168.97s/it][AINFO 12-02 15:39:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:39:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:39:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:39:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 349/1024 [15:28:19<31:21:52, 167.28s/it][A
+                                                         [A{'loss': 0.0484, 'grad_norm': 0.0013999518705531955, 'learning_rate': 1e-05, 'num_tokens': 287226394.0, 'completions/mean_length': 5960.140625, 'completions/min_length': 833.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5878.06298828125, 'completions/min_terminated_length': 833.0, 'completions/max_terminated_length': 16123.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.20175683498382568, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01937047764658928, 'sampling/sampling_logp_difference/max': 11.71871566772461, 'sampling/importance_sampling_ratio/min': 8.140038517012727e-06, 'sampling/importance_sampling_ratio/mean': 0.9999549984931946, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9556702002882957, 'clip_ratio/low_mean': 2.854056094747648e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.854056094747648e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 349/1024 [15:28:19<31:21:52, 167.28s/it][AINFO 12-02 15:41:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:41:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:41:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:41:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 350/1024 [15:30:38<29:42:58, 158.72s/it][A
+                                                         [A{'loss': 0.0692, 'grad_norm': 0.004228786565363407, 'learning_rate': 1e-05, 'num_tokens': 287935952.0, 'completions/mean_length': 5387.546875, 'completions/min_length': 464.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5300.96044921875, 'completions/min_terminated_length': 464.0, 'completions/max_terminated_length': 12561.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.29378965497016907, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020441649481654167, 'sampling/sampling_logp_difference/max': 5.121629238128662, 'sampling/importance_sampling_ratio/min': 0.005966294556856155, 'sampling/importance_sampling_ratio/mean': 1.0000360012054443, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.95712860673666, 'clip_ratio/low_mean': 3.610323426528339e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0987517928006127e-06, 'clip_ratio/high_max': 8.395007171202451e-06, 'clip_ratio/region_mean': 3.820198628545768e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 350/1024 [15:30:38<29:42:58, 158.72s/it][AINFO 12-02 15:44:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:44:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:44:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:44:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 351/1024 [15:33:17<29:42:07, 158.88s/it][A
+                                                         [A{'loss': 0.0589, 'grad_norm': 0.0029834613669663668, 'learning_rate': 1e-05, 'num_tokens': 288696000.0, 'completions/mean_length': 5799.625, 'completions/min_length': 415.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5716.283203125, 'completions/min_terminated_length': 415.0, 'completions/max_terminated_length': 15957.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3884710967540741, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021189026534557343, 'sampling/sampling_logp_difference/max': 6.780930519104004, 'sampling/importance_sampling_ratio/min': 0.0011352180736139417, 'sampling/importance_sampling_ratio/mean': 0.9999950528144836, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9457403644919395, 'clip_ratio/low_mean': 2.673440690159623e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1399092677020235e-06, 'clip_ratio/high_max': 1.2559637070808094e-05, 'clip_ratio/region_mean': 2.9874316624045605e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 351/1024 [15:33:17<29:42:07, 158.88s/it][AINFO 12-02 15:46:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:46:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:46:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:46:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 352/1024 [15:36:10<30:28:13, 163.23s/it][A
+                                                         [A{'loss': 0.0874, 'grad_norm': 0.002617151942104101, 'learning_rate': 1e-05, 'num_tokens': 289618904.0, 'completions/mean_length': 7058.6875, 'completions/min_length': 799.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6757.87060546875, 'completions/min_terminated_length': 799.0, 'completions/max_terminated_length': 16148.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.28353992104530334, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019699860364198685, 'sampling/sampling_logp_difference/max': 6.54404354095459, 'sampling/importance_sampling_ratio/min': 0.001438659499399364, 'sampling/importance_sampling_ratio/mean': 0.9999942779541016, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8782663866877556, 'clip_ratio/low_mean': 3.849920358334202e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5629689187335316e-06, 'clip_ratio/high_max': 6.2518756749341264e-06, 'clip_ratio/region_mean': 4.0062172047328204e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 352/1024 [15:36:10<30:28:13, 163.23s/it][AINFO 12-02 15:49:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:49:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:49:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:49:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 353/1024 [15:38:45<29:57:11, 160.70s/it][A
+                                                         [A{'loss': 0.044, 'grad_norm': 0.0021503251045942307, 'learning_rate': 1e-05, 'num_tokens': 290484378.0, 'completions/mean_length': 6609.953125, 'completions/min_length': 500.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6454.81005859375, 'completions/min_terminated_length': 500.0, 'completions/max_terminated_length': 15519.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.35324612259864807, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.0200796015560627, 'sampling/sampling_logp_difference/max': 9.817559242248535, 'sampling/importance_sampling_ratio/min': 5.448641240946017e-05, 'sampling/importance_sampling_ratio/mean': 0.9999619722366333, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8895087689161301, 'clip_ratio/low_mean': 6.639697721766424e-05, 'clip_ratio/low_min': 1.0295151696482208e-05, 'clip_ratio/high_mean': 4.519783715295489e-06, 'clip_ratio/high_max': 1.8079134861181956e-05, 'clip_ratio/region_mean': 7.091676206982811e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 353/1024 [15:38:45<29:57:11, 160.70s/it][AINFO 12-02 15:52:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:52:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:52:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:52:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 354/1024 [15:41:44<30:55:56, 166.20s/it][A
+                                                         [A{'loss': 0.0895, 'grad_norm': 0.0023925534915179014, 'learning_rate': 1e-05, 'num_tokens': 291512393.0, 'completions/mean_length': 7872.4921875, 'completions/min_length': 328.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7453.89306640625, 'completions/min_terminated_length': 328.0, 'completions/max_terminated_length': 16284.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.3763991594314575, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.020358648151159286, 'sampling/sampling_logp_difference/max': 7.7546706199646, 'sampling/importance_sampling_ratio/min': 0.0004287353658583015, 'sampling/importance_sampling_ratio/mean': 0.9999232292175293, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9183534607291222, 'clip_ratio/low_mean': 6.141278026916552e-05, 'clip_ratio/low_min': 1.333249815616e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 6.141278026916552e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 354/1024 [15:41:44<30:55:56, 166.20s/it][AINFO 12-02 15:55:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:55:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:55:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:55:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 355/1024 [15:44:43<31:35:20, 169.99s/it][A
+                                                         [A{'loss': 0.0642, 'grad_norm': 0.0018709113355726004, 'learning_rate': 1e-05, 'num_tokens': 292380390.0, 'completions/mean_length': 6605.6640625, 'completions/min_length': 269.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6290.23388671875, 'completions/min_terminated_length': 269.0, 'completions/max_terminated_length': 15485.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.26303553581237793, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02150166593492031, 'sampling/sampling_logp_difference/max': 21.19785499572754, 'sampling/importance_sampling_ratio/min': 6.221406168016586e-10, 'sampling/importance_sampling_ratio/mean': 0.9999732375144958, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9602678120136261, 'clip_ratio/low_mean': 1.995230707052542e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7178592720010784e-06, 'clip_ratio/high_max': 1.0912609013757901e-05, 'clip_ratio/region_mean': 2.367016588777915e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 355/1024 [15:44:43<31:35:20, 169.99s/it][AINFO 12-02 15:58:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:58:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:58:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:58:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 356/1024 [15:47:00<29:41:10, 159.99s/it][A
+                                                         [A{'loss': 0.0502, 'grad_norm': 0.0017649955116212368, 'learning_rate': 1e-05, 'num_tokens': 293255287.0, 'completions/mean_length': 6677.8828125, 'completions/min_length': 754.0, 'completions/max_length': 13477.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6677.8828125, 'completions/min_terminated_length': 754.0, 'completions/max_terminated_length': 13477.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020375655964016914, 'sampling/sampling_logp_difference/max': 5.908604621887207, 'sampling/importance_sampling_ratio/min': 0.0027159738820046186, 'sampling/importance_sampling_ratio/mean': 0.9998878240585327, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.001693107187748, 'clip_ratio/low_mean': 5.168271604816255e-05, 'clip_ratio/low_min': 7.731559890089557e-06, 'clip_ratio/high_mean': 6.279054105107207e-06, 'clip_ratio/high_max': 2.202200403189636e-05, 'clip_ratio/region_mean': 5.796177038064343e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 356/1024 [15:47:00<29:41:10, 159.99s/it][AINFO 12-02 16:00:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:00:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:00:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:00:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 357/1024 [15:49:41<29:44:16, 160.50s/it][A
+                                                         [A{'loss': 0.0637, 'grad_norm': 0.0024816791992634535, 'learning_rate': 1e-05, 'num_tokens': 294069184.0, 'completions/mean_length': 6210.6953125, 'completions/min_length': 870.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6049.21484375, 'completions/min_terminated_length': 870.0, 'completions/max_terminated_length': 15925.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021009165793657303, 'sampling/sampling_logp_difference/max': 5.342665195465088, 'sampling/importance_sampling_ratio/min': 0.0047831060364842415, 'sampling/importance_sampling_ratio/mean': 1.0000953674316406, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9842480793595314, 'clip_ratio/low_mean': 2.7612236522145395e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.223324372607749e-06, 'clip_ratio/high_max': 5.7686097534315195e-06, 'clip_ratio/region_mean': 2.9835560894753144e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 357/1024 [15:49:41<29:44:16, 160.50s/it][AINFO 12-02 16:03:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:03:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:03:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:03:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 358/1024 [15:52:28<30:01:44, 162.32s/it][A
+                                                         [A{'loss': 0.1096, 'grad_norm': 0.0030787813011556864, 'learning_rate': 1e-05, 'num_tokens': 294969111.0, 'completions/mean_length': 6876.0546875, 'completions/min_length': 206.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6725.13525390625, 'completions/min_terminated_length': 206.0, 'completions/max_terminated_length': 16085.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3514111638069153, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019419874995946884, 'sampling/sampling_logp_difference/max': 5.610518932342529, 'sampling/importance_sampling_ratio/min': 0.0036591701209545135, 'sampling/importance_sampling_ratio/mean': 0.9999387264251709, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8680268228054047, 'clip_ratio/low_mean': 4.299241186345171e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2711002455034759e-06, 'clip_ratio/high_max': 5.0844009820139036e-06, 'clip_ratio/region_mean': 4.426351074471313e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 358/1024 [15:52:28<30:01:44, 162.32s/it][AINFO 12-02 16:06:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:06:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:06:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:06:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 359/1024 [15:55:14<30:11:34, 163.45s/it][A
+                                                         [A{'loss': 0.1352, 'grad_norm': 0.0027940638829022646, 'learning_rate': 1e-05, 'num_tokens': 295894682.0, 'completions/mean_length': 7079.7734375, 'completions/min_length': 973.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6932.087890625, 'completions/min_terminated_length': 973.0, 'completions/max_terminated_length': 16154.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.40319663286209106, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021608728915452957, 'sampling/sampling_logp_difference/max': 7.999777793884277, 'sampling/importance_sampling_ratio/min': 0.00033553718822076917, 'sampling/importance_sampling_ratio/mean': 0.999890923500061, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0033101588487625, 'clip_ratio/low_mean': 3.3445195754211454e-05, 'clip_ratio/low_min': 3.1955414669937454e-06, 'clip_ratio/high_mean': 1.319957732448529e-06, 'clip_ratio/high_max': 5.279830929794116e-06, 'clip_ratio/region_mean': 3.476515314559947e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 359/1024 [15:55:14<30:11:34, 163.45s/it][AINFO 12-02 16:08:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:08:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:08:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:08:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 360/1024 [15:58:07<30:40:50, 166.34s/it][A
+                                                         [A{'loss': 0.0479, 'grad_norm': 0.0021709369029849768, 'learning_rate': 1e-05, 'num_tokens': 296744216.0, 'completions/mean_length': 6487.421875, 'completions/min_length': 637.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6249.904296875, 'completions/min_terminated_length': 637.0, 'completions/max_terminated_length': 14374.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.31800350546836853, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02024281956255436, 'sampling/sampling_logp_difference/max': 8.9999418258667, 'sampling/importance_sampling_ratio/min': 0.00012341697583906353, 'sampling/importance_sampling_ratio/mean': 1.0000174045562744, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9404204189777374, 'clip_ratio/low_mean': 3.935158406420669e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0135573802472209e-06, 'clip_ratio/high_max': 4.0542295209888835e-06, 'clip_ratio/region_mean': 4.0365141785514425e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 360/1024 [15:58:07<30:40:50, 166.34s/it][AINFO 12-02 16:11:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:11:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:11:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:11:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 361/1024 [16:00:56<30:46:27, 167.10s/it][A
+                                                         [A{'loss': -0.0018, 'grad_norm': 0.0018110686214640737, 'learning_rate': 1e-05, 'num_tokens': 297617937.0, 'completions/mean_length': 6668.1953125, 'completions/min_length': 567.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6273.24365234375, 'completions/min_terminated_length': 567.0, 'completions/max_terminated_length': 14584.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.22673210501670837, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01934785582125187, 'sampling/sampling_logp_difference/max': 7.874990940093994, 'sampling/importance_sampling_ratio/min': 0.0003801324055530131, 'sampling/importance_sampling_ratio/mean': 0.999983549118042, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8671490699052811, 'clip_ratio/low_mean': 2.0490186102506414e-05, 'clip_ratio/low_min': 2.8498473056970397e-06, 'clip_ratio/high_mean': 6.103540727053769e-06, 'clip_ratio/high_max': 2.4414162908215076e-05, 'clip_ratio/region_mean': 2.6593726602186507e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 361/1024 [16:00:56<30:46:27, 167.10s/it][AINFO 12-02 16:14:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:14:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:14:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:14:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 362/1024 [16:04:01<31:41:43, 172.36s/it][A
+                                                         [A{'loss': 0.045, 'grad_norm': 0.0014365602983161807, 'learning_rate': 1e-05, 'num_tokens': 298736304.0, 'completions/mean_length': 8579.9921875, 'completions/min_length': 363.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7989.7734375, 'completions/min_terminated_length': 363.0, 'completions/max_terminated_length': 15979.0, 'rewards/accuracy_reward/mean': 0.1953125, 'rewards/accuracy_reward/std': 0.3979988098144531, 'reward': 0.1953125, 'reward_std': 0.1999218761920929, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021067796275019646, 'sampling/sampling_logp_difference/max': 6.5686354637146, 'sampling/importance_sampling_ratio/min': 0.0014037116197869182, 'sampling/importance_sampling_ratio/mean': 0.9999146461486816, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0337364450097084, 'clip_ratio/low_mean': 2.4539695857583865e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1651780457432324e-06, 'clip_ratio/high_max': 8.66071218297293e-06, 'clip_ratio/region_mean': 2.6704873903327098e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 362/1024 [16:04:01<31:41:43, 172.36s/it][AINFO 12-02 16:17:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:17:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:17:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:17:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 363/1024 [16:06:50<31:30:03, 171.56s/it][A
+                                                         [A{'loss': 0.0862, 'grad_norm': 0.003008107887580991, 'learning_rate': 1e-05, 'num_tokens': 299653249.0, 'completions/mean_length': 7000.8203125, 'completions/min_length': 456.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6926.93701171875, 'completions/min_terminated_length': 456.0, 'completions/max_terminated_length': 16368.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3322049677371979, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020022090524435043, 'sampling/sampling_logp_difference/max': 5.999907970428467, 'sampling/importance_sampling_ratio/min': 0.002478980226442218, 'sampling/importance_sampling_ratio/mean': 0.9999739527702332, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8918163478374481, 'clip_ratio/low_mean': 5.063434127805522e-05, 'clip_ratio/low_min': 9.66116931522265e-06, 'clip_ratio/high_mean': 1.937200920565374e-06, 'clip_ratio/high_max': 7.748803682261496e-06, 'clip_ratio/region_mean': 5.257154271021136e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 363/1024 [16:06:50<31:30:03, 171.56s/it][AINFO 12-02 16:20:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:20:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:20:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:20:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 364/1024 [16:09:54<32:07:10, 175.20s/it][A
+                                                         [A{'loss': 0.0138, 'grad_norm': 0.0010370119707658887, 'learning_rate': 1e-05, 'num_tokens': 300608099.0, 'completions/mean_length': 7319.578125, 'completions/min_length': 1974.0, 'completions/max_length': 16375.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7319.578125, 'completions/min_terminated_length': 1974.0, 'completions/max_terminated_length': 16375.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.1412346363067627, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.019386455416679382, 'sampling/sampling_logp_difference/max': 9.015096664428711, 'sampling/importance_sampling_ratio/min': 0.00012156071898061782, 'sampling/importance_sampling_ratio/mean': 0.9999158382415771, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9145128801465034, 'clip_ratio/low_mean': 8.800596447144926e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7608381262543844e-06, 'clip_ratio/high_max': 1.5043352505017538e-05, 'clip_ratio/region_mean': 1.2561434687086148e-05, 'epoch': 0.33}
+
+ 36%|███▌      | 364/1024 [16:09:54<32:07:10, 175.20s/it][AINFO 12-02 16:23:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:23:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:23:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:23:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 365/1024 [16:12:38<31:27:43, 171.87s/it][A
+                                                         [A{'loss': 0.0621, 'grad_norm': 0.0021569218952208757, 'learning_rate': 1e-05, 'num_tokens': 301516535.0, 'completions/mean_length': 6956.90625, 'completions/min_length': 769.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6882.67724609375, 'completions/min_terminated_length': 769.0, 'completions/max_terminated_length': 16316.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.23462772369384766, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020638462156057358, 'sampling/sampling_logp_difference/max': 4.121843338012695, 'sampling/importance_sampling_ratio/min': 0.01621459797024727, 'sampling/importance_sampling_ratio/mean': 0.9999587535858154, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9679212644696236, 'clip_ratio/low_mean': 2.2494899667435675e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3974576492946653e-06, 'clip_ratio/high_max': 9.589830597178661e-06, 'clip_ratio/region_mean': 2.4892357714634272e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 365/1024 [16:12:38<31:27:43, 171.87s/it][AINFO 12-02 16:26:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:26:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:26:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:26:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 366/1024 [16:15:45<32:15:08, 176.46s/it][A
+                                                         [A{'loss': 0.0329, 'grad_norm': 0.0019900640472769737, 'learning_rate': 1e-05, 'num_tokens': 302422120.0, 'completions/mean_length': 6933.1953125, 'completions/min_length': 979.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6706.37646484375, 'completions/min_terminated_length': 979.0, 'completions/max_terminated_length': 16343.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.22908620536327362, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020782412961125374, 'sampling/sampling_logp_difference/max': 18.729053497314453, 'sampling/importance_sampling_ratio/min': 7.346414143682978e-09, 'sampling/importance_sampling_ratio/mean': 0.9999517202377319, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9610472694039345, 'clip_ratio/low_mean': 2.334770033485256e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.989332756442309e-06, 'clip_ratio/high_max': 1.1957331025769236e-05, 'clip_ratio/region_mean': 2.6337033204981708e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 366/1024 [16:15:45<32:15:08, 176.46s/it][AINFO 12-02 16:29:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:29:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:29:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:29:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 367/1024 [16:18:27<31:23:51, 172.04s/it][A
+                                                         [A{'loss': 0.0264, 'grad_norm': 0.001954294042661786, 'learning_rate': 1e-05, 'num_tokens': 303299402.0, 'completions/mean_length': 6711.640625, 'completions/min_length': 814.0, 'completions/max_length': 15799.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6711.640625, 'completions/min_terminated_length': 814.0, 'completions/max_terminated_length': 15799.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2856517732143402, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018188728019595146, 'sampling/sampling_logp_difference/max': 5.943129062652588, 'sampling/importance_sampling_ratio/min': 0.002623806707561016, 'sampling/importance_sampling_ratio/mean': 1.0000114440917969, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8035724982619286, 'clip_ratio/low_mean': 2.5385876426753384e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.091297739705624e-06, 'clip_ratio/high_max': 1.6365190958822495e-05, 'clip_ratio/region_mean': 2.9477173825398495e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 367/1024 [16:18:27<31:23:51, 172.04s/it][AINFO 12-02 16:32:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:32:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:32:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:32:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 368/1024 [16:21:24<31:36:39, 173.48s/it][A
+                                                         [A{'loss': 0.0187, 'grad_norm': 0.0022571857552975416, 'learning_rate': 1e-05, 'num_tokens': 304210412.0, 'completions/mean_length': 6977.890625, 'completions/min_length': 737.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6674.4677734375, 'completions/min_terminated_length': 737.0, 'completions/max_terminated_length': 15980.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.19568344950675964, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021196123212575912, 'sampling/sampling_logp_difference/max': 12.110552787780762, 'sampling/importance_sampling_ratio/min': 5.501153282239102e-06, 'sampling/importance_sampling_ratio/mean': 0.9999645948410034, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9545647650957108, 'clip_ratio/low_mean': 3.7187305906627444e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.158340237201628e-06, 'clip_ratio/high_max': 8.633360948806512e-06, 'clip_ratio/region_mean': 3.9345645916455396e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 368/1024 [16:21:24<31:36:39, 173.48s/it][AINFO 12-02 16:34:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:34:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:34:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:34:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 369/1024 [16:24:02<30:42:40, 168.79s/it][A
+                                                         [A{'loss': 0.0897, 'grad_norm': 0.003541936632245779, 'learning_rate': 1e-05, 'num_tokens': 304989015.0, 'completions/mean_length': 5952.8359375, 'completions/min_length': 651.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 5349.3798828125, 'completions/min_terminated_length': 651.0, 'completions/max_terminated_length': 16087.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.3022122383117676, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018978029489517212, 'sampling/sampling_logp_difference/max': 6.261515140533447, 'sampling/importance_sampling_ratio/min': 0.0019083521328866482, 'sampling/importance_sampling_ratio/mean': 0.9998842477798462, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.846152663230896, 'clip_ratio/low_mean': 2.7975384682576987e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0493761187244672e-06, 'clip_ratio/high_max': 1.2197504474897869e-05, 'clip_ratio/region_mean': 3.1024760801301454e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 369/1024 [16:24:02<30:42:40, 168.79s/it][AINFO 12-02 16:37:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:37:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:37:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:37:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 370/1024 [16:27:00<31:12:15, 171.77s/it][A
+                                                         [A{'loss': 0.0668, 'grad_norm': 0.0023713603150099516, 'learning_rate': 1e-05, 'num_tokens': 306032054.0, 'completions/mean_length': 7976.9296875, 'completions/min_length': 514.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7635.1787109375, 'completions/min_terminated_length': 514.0, 'completions/max_terminated_length': 16098.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2527809143066406, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0217706598341465, 'sampling/sampling_logp_difference/max': 14.937498092651367, 'sampling/importance_sampling_ratio/min': 3.2563195873080986e-07, 'sampling/importance_sampling_ratio/mean': 1.0000019073486328, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9827005565166473, 'clip_ratio/low_mean': 6.290217379500973e-05, 'clip_ratio/low_min': 1.226112590302364e-05, 'clip_ratio/high_mean': 2.9314706466720963e-06, 'clip_ratio/high_max': 1.1725882586688385e-05, 'clip_ratio/region_mean': 6.583364438483841e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 370/1024 [16:27:00<31:12:15, 171.77s/it][AINFO 12-02 16:40:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:40:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:40:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:40:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 371/1024 [16:30:02<31:41:32, 174.72s/it][A
+                                                         [A{'loss': 0.162, 'grad_norm': 0.0023132229689508677, 'learning_rate': 1e-05, 'num_tokens': 306960599.0, 'completions/mean_length': 7117.8828125, 'completions/min_length': 1314.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6895.49609375, 'completions/min_terminated_length': 1314.0, 'completions/max_terminated_length': 16369.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.34822866320610046, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018669119104743004, 'sampling/sampling_logp_difference/max': 7.2168169021606445, 'sampling/importance_sampling_ratio/min': 0.0007341355667449534, 'sampling/importance_sampling_ratio/mean': 0.9999181032180786, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8897347301244736, 'clip_ratio/low_mean': 4.01184702241153e-05, 'clip_ratio/low_min': 1.341508686891757e-05, 'clip_ratio/high_mean': 7.721868257704045e-06, 'clip_ratio/high_max': 2.3902987095425487e-05, 'clip_ratio/region_mean': 4.784033922078379e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 371/1024 [16:30:02<31:41:32, 174.72s/it][AINFO 12-02 16:43:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:43:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:43:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:43:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▋      | 372/1024 [16:32:46<31:03:05, 171.45s/it][A
+                                                         [A{'loss': 0.0754, 'grad_norm': 0.0030236958991736174, 'learning_rate': 1e-05, 'num_tokens': 307873100.0, 'completions/mean_length': 6971.0390625, 'completions/min_length': 871.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6745.12841796875, 'completions/min_terminated_length': 871.0, 'completions/max_terminated_length': 15995.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.34245961904525757, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.022024717181921005, 'sampling/sampling_logp_difference/max': 4.525454521179199, 'sampling/importance_sampling_ratio/min': 0.01082979142665863, 'sampling/importance_sampling_ratio/mean': 1.0000091791152954, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0919678956270218, 'clip_ratio/low_mean': 4.9660218792269006e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0928101801255252e-06, 'clip_ratio/high_max': 4.371240720502101e-06, 'clip_ratio/region_mean': 5.075302897239453e-05, 'epoch': 0.34}
+
+ 36%|███▋      | 372/1024 [16:32:46<31:03:05, 171.45s/it][AINFO 12-02 16:46:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:46:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:46:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:46:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▋      | 373/1024 [16:35:29<30:33:56, 169.03s/it][A
+                                                         [A{'loss': 0.0542, 'grad_norm': 0.0018919071881100535, 'learning_rate': 1e-05, 'num_tokens': 308804876.0, 'completions/mean_length': 7120.0, 'completions/min_length': 1685.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6897.66455078125, 'completions/min_terminated_length': 1685.0, 'completions/max_terminated_length': 14442.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022177904844284058, 'sampling/sampling_logp_difference/max': 6.747039794921875, 'sampling/importance_sampling_ratio/min': 0.0011743507348001003, 'sampling/importance_sampling_ratio/mean': 0.9999696612358093, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0812252908945084, 'clip_ratio/low_mean': 3.061858558339736e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0854085985556594e-06, 'clip_ratio/high_max': 4.341634394222638e-06, 'clip_ratio/region_mean': 3.170399429563986e-05, 'epoch': 0.34}
+
+ 36%|███▋      | 373/1024 [16:35:29<30:33:56, 169.03s/it][AINFO 12-02 16:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:49:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 374/1024 [16:38:29<31:07:03, 172.34s/it][A
+                                                         [A{'loss': 0.0195, 'grad_norm': 0.001848200336098671, 'learning_rate': 1e-05, 'num_tokens': 309762603.0, 'completions/mean_length': 7344.5546875, 'completions/min_length': 127.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6977.09716796875, 'completions/min_terminated_length': 127.0, 'completions/max_terminated_length': 15814.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2188364714384079, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02034556306898594, 'sampling/sampling_logp_difference/max': 8.1235933303833, 'sampling/importance_sampling_ratio/min': 0.0002964614541269839, 'sampling/importance_sampling_ratio/mean': 0.9999948143959045, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9340410158038139, 'clip_ratio/low_mean': 1.3996559573570266e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1549691407708451e-06, 'clip_ratio/high_max': 4.6198765630833805e-06, 'clip_ratio/region_mean': 1.5151528714341111e-05, 'epoch': 0.34}
+
+ 37%|███▋      | 374/1024 [16:38:29<31:07:03, 172.34s/it][AINFO 12-02 16:52:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:52:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:52:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:52:02 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 16:53:39,834 - math_verify.grader - WARNING - Timeout during comparison
+
+ 37%|███▋      | 375/1024 [16:41:03<30:03:15, 166.71s/it][A
+                                                         [A{'loss': 0.0581, 'grad_norm': 0.0012123315827921033, 'learning_rate': 1e-05, 'num_tokens': 310628230.0, 'completions/mean_length': 6615.6484375, 'completions/min_length': 116.0, 'completions/max_length': 15244.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6615.6484375, 'completions/min_terminated_length': 116.0, 'completions/max_terminated_length': 15244.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020877305418252945, 'sampling/sampling_logp_difference/max': 10.562370300292969, 'sampling/importance_sampling_ratio/min': 2.587145718280226e-05, 'sampling/importance_sampling_ratio/mean': 0.9999868869781494, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.971637412905693, 'clip_ratio/low_mean': 2.4544106395296694e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.478433995951491e-06, 'clip_ratio/high_max': 1.3913735983805964e-05, 'clip_ratio/region_mean': 2.8022539936500834e-05, 'epoch': 0.34}
+
+ 37%|███▋      | 375/1024 [16:41:03<30:03:15, 166.71s/it][AINFO 12-02 16:54:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:54:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:54:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:54:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 376/1024 [16:43:43<29:40:28, 164.86s/it][A
+                                                         [A{'loss': 0.086, 'grad_norm': 0.002646032487973571, 'learning_rate': 1e-05, 'num_tokens': 311457466.0, 'completions/mean_length': 6333.84375, 'completions/min_length': 564.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6009.64501953125, 'completions/min_terminated_length': 564.0, 'completions/max_terminated_length': 16028.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.34928950667381287, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020464638248085976, 'sampling/sampling_logp_difference/max': 3.782731533050537, 'sampling/importance_sampling_ratio/min': 0.022760435938835144, 'sampling/importance_sampling_ratio/mean': 1.0000247955322266, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9569023698568344, 'clip_ratio/low_mean': 4.789722436271404e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5299877986763022e-06, 'clip_ratio/high_max': 6.119951194705209e-06, 'clip_ratio/region_mean': 4.942721272982453e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 376/1024 [16:43:43<29:40:28, 164.86s/it][AINFO 12-02 16:57:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:57:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:57:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:57:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 377/1024 [16:46:25<29:27:33, 163.92s/it][A
+                                                         [A{'loss': -0.0212, 'grad_norm': 0.0021437006071209908, 'learning_rate': 1e-05, 'num_tokens': 312330879.0, 'completions/mean_length': 6657.8515625, 'completions/min_length': 594.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6503.46875, 'completions/min_terminated_length': 594.0, 'completions/max_terminated_length': 15719.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.25354230403900146, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021411258727312088, 'sampling/sampling_logp_difference/max': 3.9020423889160156, 'sampling/importance_sampling_ratio/min': 0.020200612023472786, 'sampling/importance_sampling_ratio/mean': 1.000024437904358, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.029910758137703, 'clip_ratio/low_mean': 4.1024483266483e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.531641593530367e-06, 'clip_ratio/high_max': 1.8126566374121467e-05, 'clip_ratio/region_mean': 4.5556124632639694e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 377/1024 [16:46:25<29:27:33, 163.92s/it][AINFO 12-02 16:59:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:59:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:59:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:59:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 378/1024 [16:49:19<29:58:19, 167.03s/it][A
+                                                         [A{'loss': 0.066, 'grad_norm': 0.0018965511117130518, 'learning_rate': 1e-05, 'num_tokens': 313331898.0, 'completions/mean_length': 7657.8359375, 'completions/min_length': 741.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7376.3466796875, 'completions/min_terminated_length': 741.0, 'completions/max_terminated_length': 16311.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.18884865939617157, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021029409021139145, 'sampling/sampling_logp_difference/max': 11.75278091430664, 'sampling/importance_sampling_ratio/min': 7.867415661166888e-06, 'sampling/importance_sampling_ratio/mean': 1.000010371208191, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9699486121535301, 'clip_ratio/low_mean': 2.7961265118392475e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.7961265118392475e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 378/1024 [16:49:19<29:58:19, 167.03s/it][AINFO 12-02 17:02:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:02:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:02:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:02:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 379/1024 [16:52:07<29:56:10, 167.09s/it][A
+                                                         [A{'loss': 0.0346, 'grad_norm': 0.0016498853219673038, 'learning_rate': 1e-05, 'num_tokens': 314258601.0, 'completions/mean_length': 7085.3671875, 'completions/min_length': 28.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6937.77001953125, 'completions/min_terminated_length': 28.0, 'completions/max_terminated_length': 14619.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.24329257011413574, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021414825692772865, 'sampling/sampling_logp_difference/max': 3.367583990097046, 'sampling/importance_sampling_ratio/min': 0.03447282314300537, 'sampling/importance_sampling_ratio/mean': 1.000105857849121, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0943557620048523, 'clip_ratio/low_mean': 4.313065619498957e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7168170504410227e-06, 'clip_ratio/high_max': 7.721664815107943e-06, 'clip_ratio/region_mean': 4.584747375702136e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 379/1024 [16:52:07<29:56:10, 167.09s/it][AINFO 12-02 17:05:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:05:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:05:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:05:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 380/1024 [16:55:10<30:44:48, 171.88s/it][A
+                                                         [A{'loss': 0.0041, 'grad_norm': 0.001364902127534151, 'learning_rate': 1e-05, 'num_tokens': 315256840.0, 'completions/mean_length': 7614.1171875, 'completions/min_length': 511.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7182.81103515625, 'completions/min_terminated_length': 511.0, 'completions/max_terminated_length': 15631.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3503454327583313, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02000460773706436, 'sampling/sampling_logp_difference/max': 9.585142135620117, 'sampling/importance_sampling_ratio/min': 6.874255632283166e-05, 'sampling/importance_sampling_ratio/mean': 0.999970018863678, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9673903658986092, 'clip_ratio/low_mean': 3.479703536868328e-05, 'clip_ratio/low_min': 2.6767741019284585e-06, 'clip_ratio/high_mean': 1.988372332561994e-06, 'clip_ratio/high_max': 7.953489330247976e-06, 'clip_ratio/region_mean': 3.6785407701245276e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 380/1024 [16:55:10<30:44:48, 171.88s/it][AINFO 12-02 17:08:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:08:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:08:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:08:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 381/1024 [16:58:08<31:02:32, 173.80s/it][A
+                                                         [A{'loss': 0.0705, 'grad_norm': 0.002171436557546258, 'learning_rate': 1e-05, 'num_tokens': 316268976.0, 'completions/mean_length': 7766.9375, 'completions/min_length': 57.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7630.1591796875, 'completions/min_terminated_length': 57.0, 'completions/max_terminated_length': 15637.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021251089870929718, 'sampling/sampling_logp_difference/max': 9.499998092651367, 'sampling/importance_sampling_ratio/min': 7.485197420464829e-05, 'sampling/importance_sampling_ratio/mean': 0.9999735951423645, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0277370810508728, 'clip_ratio/low_mean': 4.410173994529032e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7452061911171768e-06, 'clip_ratio/high_max': 6.980824764468707e-06, 'clip_ratio/region_mean': 4.5846945681660145e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 381/1024 [16:58:08<31:02:32, 173.80s/it][AINFO 12-02 17:11:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:11:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:11:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:11:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 382/1024 [17:00:55<30:37:48, 171.76s/it][A
+                                                         [A{'loss': 0.0336, 'grad_norm': 0.0013348929351195693, 'learning_rate': 1e-05, 'num_tokens': 317285677.0, 'completions/mean_length': 7774.2265625, 'completions/min_length': 595.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7567.59228515625, 'completions/min_terminated_length': 595.0, 'completions/max_terminated_length': 15752.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.022232960909605026, 'sampling/sampling_logp_difference/max': 13.248327255249023, 'sampling/importance_sampling_ratio/min': 1.7632934259381727e-06, 'sampling/importance_sampling_ratio/mean': 0.9999904632568359, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0064171329140663, 'clip_ratio/low_mean': 3.973216325903195e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5061395919910865e-06, 'clip_ratio/high_max': 9.843256520980503e-06, 'clip_ratio/region_mean': 4.323830307839671e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 382/1024 [17:00:55<30:37:48, 171.76s/it][AINFO 12-02 17:14:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:14:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:14:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:14:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 383/1024 [17:03:53<30:56:02, 173.73s/it][A
+                                                         [A{'loss': 0.0809, 'grad_norm': 0.002925361506640911, 'learning_rate': 1e-05, 'num_tokens': 318148276.0, 'completions/mean_length': 6547.1796875, 'completions/min_length': 894.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6469.724609375, 'completions/min_terminated_length': 894.0, 'completions/max_terminated_length': 16208.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020470617339015007, 'sampling/sampling_logp_difference/max': 11.812414169311523, 'sampling/importance_sampling_ratio/min': 7.411971182591515e-06, 'sampling/importance_sampling_ratio/mean': 0.9999691843986511, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9192209765315056, 'clip_ratio/low_mean': 3.2021426648043416e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.2021426648043416e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 383/1024 [17:03:53<30:56:02, 173.73s/it][AINFO 12-02 17:17:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:17:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:17:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:17:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 384/1024 [17:07:05<31:49:34, 179.02s/it][A
+                                                         [A{'loss': 0.0388, 'grad_norm': 0.002288331277668476, 'learning_rate': 1e-05, 'num_tokens': 319052224.0, 'completions/mean_length': 6904.40625, 'completions/min_length': 964.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6101.05078125, 'completions/min_terminated_length': 964.0, 'completions/max_terminated_length': 15923.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.23645779490470886, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02049478143453598, 'sampling/sampling_logp_difference/max': 11.426142692565918, 'sampling/importance_sampling_ratio/min': 1.0906596799031831e-05, 'sampling/importance_sampling_ratio/mean': 0.9999583959579468, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9611739367246628, 'clip_ratio/low_mean': 2.0332364726982632e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.8596210742980475e-06, 'clip_ratio/high_max': 1.543848429719219e-05, 'clip_ratio/region_mean': 2.4191985573907004e-05, 'epoch': 0.35}
+
+ 38%|███▊      | 384/1024 [17:07:05<31:49:34, 179.02s/it][AINFO 12-02 17:20:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:20:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:20:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:20:40 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 385/1024 [17:10:08<31:59:56, 180.28s/it][A
+                                                         [A{'loss': 0.111, 'grad_norm': 0.0015405584126710892, 'learning_rate': 1e-05, 'num_tokens': 320051534.0, 'completions/mean_length': 7632.359375, 'completions/min_length': 435.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7350.04833984375, 'completions/min_terminated_length': 435.0, 'completions/max_terminated_length': 16195.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.30327799916267395, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021682340651750565, 'sampling/sampling_logp_difference/max': 8.810256004333496, 'sampling/importance_sampling_ratio/min': 0.00014919505338184536, 'sampling/importance_sampling_ratio/mean': 1.0000008344650269, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0255606770515442, 'clip_ratio/low_mean': 3.791802066643868e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.662984454422258e-06, 'clip_ratio/high_max': 1.0430391284899088e-05, 'clip_ratio/region_mean': 4.158100534823461e-05, 'epoch': 0.35}
+
+ 38%|███▊      | 385/1024 [17:10:08<31:59:56, 180.28s/it][AINFO 12-02 17:23:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:23:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:23:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:23:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 386/1024 [17:12:37<30:18:47, 171.05s/it][A
+                                                         [A{'loss': 0.0707, 'grad_norm': 0.00142462900839746, 'learning_rate': 1e-05, 'num_tokens': 320872143.0, 'completions/mean_length': 6260.8828125, 'completions/min_length': 1371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6100.19873046875, 'completions/min_terminated_length': 1371.0, 'completions/max_terminated_length': 14348.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018331468105316162, 'sampling/sampling_logp_difference/max': 7.9686665534973145, 'sampling/importance_sampling_ratio/min': 0.0003461402375251055, 'sampling/importance_sampling_ratio/mean': 0.9999794960021973, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7945073395967484, 'clip_ratio/low_mean': 2.2116193804322393e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.752025466179475e-06, 'clip_ratio/high_max': 1.10081018647179e-05, 'clip_ratio/region_mean': 2.4868219043128192e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 386/1024 [17:12:37<30:18:47, 171.05s/it][AINFO 12-02 17:26:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:26:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:26:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:26:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 387/1024 [17:15:23<29:58:22, 169.39s/it][A
+                                                         [A{'loss': 0.0527, 'grad_norm': 0.0019111793953925371, 'learning_rate': 1e-05, 'num_tokens': 321885447.0, 'completions/mean_length': 7761.375, 'completions/min_length': 765.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7693.48046875, 'completions/min_terminated_length': 765.0, 'completions/max_terminated_length': 15807.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2806519567966461, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02205459028482437, 'sampling/sampling_logp_difference/max': 8.169577598571777, 'sampling/importance_sampling_ratio/min': 0.00028313760412856936, 'sampling/importance_sampling_ratio/mean': 0.9999994039535522, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0799954682588577, 'clip_ratio/low_mean': 2.7251681331108557e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.2380747850547777e-06, 'clip_ratio/high_max': 8.952299140219111e-06, 'clip_ratio/region_mean': 2.9489756570910686e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 387/1024 [17:15:23<29:58:22, 169.39s/it][AINFO 12-02 17:28:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 388/1024 [17:18:12<29:54:12, 169.26s/it][A
+                                                         [A{'loss': 0.0502, 'grad_norm': 0.001482969499193132, 'learning_rate': 1e-05, 'num_tokens': 322838797.0, 'completions/mean_length': 7294.796875, 'completions/min_length': 662.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7076.65625, 'completions/min_terminated_length': 662.0, 'completions/max_terminated_length': 15808.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.36007601022720337, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01850103959441185, 'sampling/sampling_logp_difference/max': 14.669804573059082, 'sampling/importance_sampling_ratio/min': 4.2558355062283226e-07, 'sampling/importance_sampling_ratio/mean': 0.9999374151229858, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8159547671675682, 'clip_ratio/low_mean': 4.549925756691664e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.09954668359569e-06, 'clip_ratio/high_max': 1.6241773209912935e-05, 'clip_ratio/region_mean': 5.0598803454704466e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 388/1024 [17:18:12<29:54:12, 169.26s/it][AINFO 12-02 17:31:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:31:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:31:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:31:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 389/1024 [17:21:00<29:47:17, 168.88s/it][A
+                                                         [A{'loss': 0.075, 'grad_norm': 0.001755179837346077, 'learning_rate': 1e-05, 'num_tokens': 323782333.0, 'completions/mean_length': 7229.875, 'completions/min_length': 579.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6779.671875, 'completions/min_terminated_length': 579.0, 'completions/max_terminated_length': 15897.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.24541424214839935, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02008935809135437, 'sampling/sampling_logp_difference/max': 9.50635051727295, 'sampling/importance_sampling_ratio/min': 7.437798922182992e-05, 'sampling/importance_sampling_ratio/mean': 0.9999120831489563, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9420096501708031, 'clip_ratio/low_mean': 2.875013205994037e-05, 'clip_ratio/low_min': 3.824852228717646e-06, 'clip_ratio/high_mean': 1.7856882550404407e-06, 'clip_ratio/high_max': 4.2527130972302984e-06, 'clip_ratio/region_mean': 3.053582031498081e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 389/1024 [17:21:00<29:47:17, 168.88s/it][AINFO 12-02 17:34:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:34:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:34:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:34:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 390/1024 [17:23:46<29:37:14, 168.19s/it][A
+                                                         [A{'loss': 0.0549, 'grad_norm': 0.00202886201441288, 'learning_rate': 1e-05, 'num_tokens': 324648848.0, 'completions/mean_length': 6628.8359375, 'completions/min_length': 851.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6473.99267578125, 'completions/min_terminated_length': 851.0, 'completions/max_terminated_length': 16016.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.15650184452533722, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021120186895132065, 'sampling/sampling_logp_difference/max': 3.5622735023498535, 'sampling/importance_sampling_ratio/min': 0.028374243527650833, 'sampling/importance_sampling_ratio/mean': 0.9999722242355347, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0327190533280373, 'clip_ratio/low_mean': 2.0493020770118164e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.202648829552345e-06, 'clip_ratio/high_max': 8.81059531820938e-06, 'clip_ratio/region_mean': 2.269566959967051e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 390/1024 [17:23:46<29:37:14, 168.19s/it][AINFO 12-02 17:37:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:37:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:37:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:37:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 391/1024 [17:26:28<29:14:49, 166.33s/it][A
+                                                         [A{'loss': 0.0271, 'grad_norm': 0.0026126320008188486, 'learning_rate': 1e-05, 'num_tokens': 325617965.0, 'completions/mean_length': 7402.4140625, 'completions/min_length': 367.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7259.849609375, 'completions/min_terminated_length': 367.0, 'completions/max_terminated_length': 15405.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.21436560153961182, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021358007565140724, 'sampling/sampling_logp_difference/max': 6.191333770751953, 'sampling/importance_sampling_ratio/min': 0.002047094516456127, 'sampling/importance_sampling_ratio/mean': 1.0000274181365967, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0335597470402718, 'clip_ratio/low_mean': 5.5018343005031056e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.5018343005031056e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 391/1024 [17:26:28<29:14:49, 166.33s/it][AINFO 12-02 17:40:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:40:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:40:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:40:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 392/1024 [17:29:29<29:57:44, 170.67s/it][A
+                                                         [A{'loss': 0.0558, 'grad_norm': 0.004497586749494076, 'learning_rate': 1e-05, 'num_tokens': 326583819.0, 'completions/mean_length': 7414.046875, 'completions/min_length': 467.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7124.693359375, 'completions/min_terminated_length': 467.0, 'completions/max_terminated_length': 16246.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021614551544189453, 'sampling/sampling_logp_difference/max': 6.500093460083008, 'sampling/importance_sampling_ratio/min': 0.0015032986411824822, 'sampling/importance_sampling_ratio/mean': 0.9999969005584717, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.043906107544899, 'clip_ratio/low_mean': 4.4972417526878417e-05, 'clip_ratio/low_min': 8.263916242867708e-06, 'clip_ratio/high_mean': 5.139017389410583e-06, 'clip_ratio/high_max': 1.7713674878905294e-05, 'clip_ratio/region_mean': 5.0111435712096863e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 392/1024 [17:29:29<29:57:44, 170.67s/it][AINFO 12-02 17:43:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:43:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:43:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:43:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 393/1024 [17:32:00<28:53:44, 164.86s/it][A
+                                                         [A{'loss': 0.0468, 'grad_norm': 0.0026554155629128218, 'learning_rate': 1e-05, 'num_tokens': 327512315.0, 'completions/mean_length': 7090.5, 'completions/min_length': 1183.0, 'completions/max_length': 14288.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7090.5, 'completions/min_terminated_length': 1183.0, 'completions/max_terminated_length': 14288.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.27722427248954773, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020926889032125473, 'sampling/sampling_logp_difference/max': 9.552186012268066, 'sampling/importance_sampling_ratio/min': 7.104578980943188e-05, 'sampling/importance_sampling_ratio/mean': 0.999885618686676, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9755794927477837, 'clip_ratio/low_mean': 2.0601042535872693e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.603103545581689e-06, 'clip_ratio/high_max': 2.2412414182326756e-05, 'clip_ratio/region_mean': 2.620414619514122e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 393/1024 [17:32:00<28:53:44, 164.86s/it][AINFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 394/1024 [17:34:39<28:32:34, 163.10s/it][A
+                                                         [A{'loss': -0.0024, 'grad_norm': 0.0018455780809745193, 'learning_rate': 1e-05, 'num_tokens': 328454269.0, 'completions/mean_length': 7200.140625, 'completions/min_length': 80.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7127.82666015625, 'completions/min_terminated_length': 80.0, 'completions/max_terminated_length': 15710.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2301519364118576, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01939154416322708, 'sampling/sampling_logp_difference/max': 7.989675045013428, 'sampling/importance_sampling_ratio/min': 0.00033894419902935624, 'sampling/importance_sampling_ratio/mean': 0.9999072551727295, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9084664657711983, 'clip_ratio/low_mean': 3.035687961983058e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5600960523443064e-06, 'clip_ratio/high_max': 3.259367531427415e-06, 'clip_ratio/region_mean': 3.191697578586172e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 394/1024 [17:34:39<28:32:34, 163.10s/it][AINFO 12-02 17:48:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:48:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:48:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:48:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▊      | 395/1024 [17:37:28<28:45:49, 164.63s/it][A
+                                                         [A{'loss': 0.06, 'grad_norm': 0.0021165197249501944, 'learning_rate': 1e-05, 'num_tokens': 329366400.0, 'completions/mean_length': 6977.5234375, 'completions/min_length': 193.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6674.08837890625, 'completions/min_terminated_length': 193.0, 'completions/max_terminated_length': 16296.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.26485776901245117, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020020857453346252, 'sampling/sampling_logp_difference/max': 3.3792455196380615, 'sampling/importance_sampling_ratio/min': 0.034073151648044586, 'sampling/importance_sampling_ratio/mean': 1.0000401735305786, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9352559298276901, 'clip_ratio/low_mean': 3.0616293088314706e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.565165686974069e-06, 'clip_ratio/high_max': 1.0260662747896276e-05, 'clip_ratio/region_mean': 3.318145900266245e-05, 'epoch': 0.36}
+
+ 39%|███▊      | 395/1024 [17:37:28<28:45:49, 164.63s/it][AINFO 12-02 17:51:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:51:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:51:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:51:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▊      | 396/1024 [17:40:41<30:12:00, 173.12s/it][A
+                                                         [A{'loss': 0.1424, 'grad_norm': 0.0022391832899302244, 'learning_rate': 1e-05, 'num_tokens': 330386442.0, 'completions/mean_length': 7819.828125, 'completions/min_length': 1273.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7398.63916015625, 'completions/min_terminated_length': 1273.0, 'completions/max_terminated_length': 16230.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.29302334785461426, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021083837375044823, 'sampling/sampling_logp_difference/max': 8.142753601074219, 'sampling/importance_sampling_ratio/min': 0.0002908352471422404, 'sampling/importance_sampling_ratio/mean': 0.9999501705169678, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0045175030827522, 'clip_ratio/low_mean': 4.1006693436429487e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.9430195139830175e-06, 'clip_ratio/high_max': 2.777207805593207e-05, 'clip_ratio/region_mean': 4.794971300725592e-05, 'epoch': 0.36}
+
+ 39%|███▊      | 396/1024 [17:40:41<30:12:00, 173.12s/it][AINFO 12-02 17:54:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 397/1024 [17:43:48<30:53:34, 177.38s/it][A
+                                                         [A{'loss': 0.0503, 'grad_norm': 0.002260354580357671, 'learning_rate': 1e-05, 'num_tokens': 331286181.0, 'completions/mean_length': 6872.0859375, 'completions/min_length': 609.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6485.42236328125, 'completions/min_terminated_length': 609.0, 'completions/max_terminated_length': 16135.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2280302792787552, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019428331404924393, 'sampling/sampling_logp_difference/max': 8.185760498046875, 'sampling/importance_sampling_ratio/min': 0.0002785924880299717, 'sampling/importance_sampling_ratio/mean': 0.9999260902404785, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8501477539539337, 'clip_ratio/low_mean': 3.623322004386864e-05, 'clip_ratio/low_min': 5.5314631026703864e-06, 'clip_ratio/high_mean': 2.0105512703594286e-06, 'clip_ratio/high_max': 8.042205081437714e-06, 'clip_ratio/region_mean': 3.8243771086854395e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 397/1024 [17:43:48<30:53:34, 177.38s/it][AINFO 12-02 17:57:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 398/1024 [17:46:31<30:06:19, 173.13s/it][A
+                                                         [A{'loss': 0.0525, 'grad_norm': 0.001649077981710434, 'learning_rate': 1e-05, 'num_tokens': 332166003.0, 'completions/mean_length': 6724.546875, 'completions/min_length': 588.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6571.22265625, 'completions/min_terminated_length': 588.0, 'completions/max_terminated_length': 16187.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020595930516719818, 'sampling/sampling_logp_difference/max': 12.493709564208984, 'sampling/importance_sampling_ratio/min': 3.7501690712815616e-06, 'sampling/importance_sampling_ratio/mean': 0.9999189376831055, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0110125690698624, 'clip_ratio/low_mean': 2.8814496317863814e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.01508122094674e-07, 'clip_ratio/high_max': 3.206032488378696e-06, 'clip_ratio/region_mean': 2.9616004439958488e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 398/1024 [17:46:31<30:06:19, 173.13s/it][AINFO 12-02 18:00:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:00:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:00:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:00:04 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 18:02:06,490 - math_verify.grader - WARNING - Timeout during comparison
+[OpenTinker] 2025-12-02 18:02:11,585 - math_verify.grader - WARNING - Timeout during comparison
+[OpenTinker] 2025-12-02 18:02:16,594 - math_verify.grader - WARNING - Timeout during comparison
+
+ 39%|███▉      | 399/1024 [17:49:51<31:26:49, 181.13s/it][A
+                                                         [A{'loss': 0.084, 'grad_norm': 0.0010421582264825702, 'learning_rate': 1e-05, 'num_tokens': 333188785.0, 'completions/mean_length': 7833.546875, 'completions/min_length': 1509.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7485.96728515625, 'completions/min_terminated_length': 1509.0, 'completions/max_terminated_length': 16096.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.22567126154899597, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018777694553136826, 'sampling/sampling_logp_difference/max': 7.110662937164307, 'sampling/importance_sampling_ratio/min': 0.0008163535967469215, 'sampling/importance_sampling_ratio/mean': 0.9999600648880005, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8942571505904198, 'clip_ratio/low_mean': 3.4109823332073574e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.787299877127225e-06, 'clip_ratio/high_max': 1.11491995085089e-05, 'clip_ratio/region_mean': 3.689712332288764e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 399/1024 [17:49:51<31:26:49, 181.13s/it][AINFO 12-02 18:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:03:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 400/1024 [17:52:38<30:39:18, 176.86s/it][A
+                                                         [A{'loss': 0.0284, 'grad_norm': 0.002221160801127553, 'learning_rate': 1e-05, 'num_tokens': 334128989.0, 'completions/mean_length': 7194.96875, 'completions/min_length': 183.0, 'completions/max_length': 15624.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7194.96875, 'completions/min_terminated_length': 183.0, 'completions/max_terminated_length': 15624.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.26826781034469604, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021122492849826813, 'sampling/sampling_logp_difference/max': 11.729671478271484, 'sampling/importance_sampling_ratio/min': 8.05134459369583e-06, 'sampling/importance_sampling_ratio/mean': 0.999954879283905, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0446517765522003, 'clip_ratio/low_mean': 3.350823226355715e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.525491879623587e-06, 'clip_ratio/high_max': 1.0101967518494348e-05, 'clip_ratio/region_mean': 3.603372420002415e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 400/1024 [17:52:38<30:39:18, 176.86s/it][AINFO 12-02 18:06:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:06:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:06:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:06:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 401/1024 [17:55:24<30:02:02, 173.55s/it][A
+                                                         [A{'loss': 0.0467, 'grad_norm': 0.0030101474840193987, 'learning_rate': 1e-05, 'num_tokens': 335047917.0, 'completions/mean_length': 7037.875, 'completions/min_length': 810.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6657.951171875, 'completions/min_terminated_length': 810.0, 'completions/max_terminated_length': 14241.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2398776262998581, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02049148827791214, 'sampling/sampling_logp_difference/max': 10.622724533081055, 'sampling/importance_sampling_ratio/min': 2.435619171592407e-05, 'sampling/importance_sampling_ratio/mean': 0.9999676942825317, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9549769386649132, 'clip_ratio/low_mean': 2.5873220806715835e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4975081512602628e-06, 'clip_ratio/high_max': 5.990032605041051e-06, 'clip_ratio/region_mean': 2.737072884428926e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 401/1024 [17:55:24<30:02:02, 173.55s/it][AINFO 12-02 18:08:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 402/1024 [17:58:26<30:26:54, 176.23s/it][A
+                                                         [A{'loss': 0.0627, 'grad_norm': 0.0016104152891784906, 'learning_rate': 1e-05, 'num_tokens': 336042178.0, 'completions/mean_length': 7596.7890625, 'completions/min_length': 285.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7313.33056640625, 'completions/min_terminated_length': 285.0, 'completions/max_terminated_length': 16100.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.27722427248954773, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019274067133665085, 'sampling/sampling_logp_difference/max': 4.869933128356934, 'sampling/importance_sampling_ratio/min': 0.007673877757042646, 'sampling/importance_sampling_ratio/mean': 0.999981164932251, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8307650238275528, 'clip_ratio/low_mean': 3.1645918625144986e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.020656438617152e-06, 'clip_ratio/high_max': 8.082625754468609e-06, 'clip_ratio/region_mean': 3.366657551850949e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 402/1024 [17:58:26<30:26:54, 176.23s/it][AINFO 12-02 18:11:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:11:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:11:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:11:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 403/1024 [18:01:28<30:41:21, 177.91s/it][A
+                                                         [A{'loss': 0.0452, 'grad_norm': 0.0010739013087004423, 'learning_rate': 1e-05, 'num_tokens': 336963318.0, 'completions/mean_length': 7060.34375, 'completions/min_length': 897.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6836.576171875, 'completions/min_terminated_length': 897.0, 'completions/max_terminated_length': 16134.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.1733490228652954, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02168721705675125, 'sampling/sampling_logp_difference/max': 9.069646835327148, 'sampling/importance_sampling_ratio/min': 0.00011510718468343839, 'sampling/importance_sampling_ratio/mean': 1.000027060508728, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0481776595115662, 'clip_ratio/low_mean': 1.32123756202418e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4120464584266301e-06, 'clip_ratio/high_max': 5.6481858337065205e-06, 'clip_ratio/region_mean': 1.4624422078668431e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 403/1024 [18:01:28<30:41:21, 177.91s/it][AINFO 12-02 18:15:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:15:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:15:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:15:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 404/1024 [18:04:20<30:20:22, 176.16s/it][A
+                                                         [A{'loss': 0.0255, 'grad_norm': 0.0016449482645839453, 'learning_rate': 1e-05, 'num_tokens': 337972068.0, 'completions/mean_length': 7746.484375, 'completions/min_length': 960.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7609.38134765625, 'completions/min_terminated_length': 960.0, 'completions/max_terminated_length': 16357.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.1830747127532959, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021722178906202316, 'sampling/sampling_logp_difference/max': 7.34059476852417, 'sampling/importance_sampling_ratio/min': 0.0006486645434051752, 'sampling/importance_sampling_ratio/mean': 0.9999643564224243, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0216905921697617, 'clip_ratio/low_mean': 1.9099150676993304e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.000512252692715e-07, 'clip_ratio/high_max': 3.200204901077086e-06, 'clip_ratio/region_mean': 1.9899201902262575e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 404/1024 [18:04:20<30:20:22, 176.16s/it][AINFO 12-02 18:17:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:17:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:17:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:17:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|███▉      | 405/1024 [18:07:05<29:44:24, 172.96s/it][A
+                                                         [A{'loss': 0.0358, 'grad_norm': 0.003412841120734811, 'learning_rate': 1e-05, 'num_tokens': 338876663.0, 'completions/mean_length': 6913.3984375, 'completions/min_length': 843.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6686.1044921875, 'completions/min_terminated_length': 843.0, 'completions/max_terminated_length': 15784.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.33797895908355713, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02173588052392006, 'sampling/sampling_logp_difference/max': 8.98972225189209, 'sampling/importance_sampling_ratio/min': 0.00012468472414184362, 'sampling/importance_sampling_ratio/mean': 0.9999386668205261, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9993953481316566, 'clip_ratio/low_mean': 4.580058657666086e-05, 'clip_ratio/low_min': 4.674994215747574e-06, 'clip_ratio/high_mean': 5.404108605944202e-06, 'clip_ratio/high_max': 2.161643442377681e-05, 'clip_ratio/region_mean': 5.120469540997874e-05, 'epoch': 0.37}
+
+ 40%|███▉      | 405/1024 [18:07:05<29:44:24, 172.96s/it][AINFO 12-02 18:20:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:20:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:20:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:20:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|███▉      | 406/1024 [18:10:03<29:55:18, 174.30s/it][A
+                                                         [A{'loss': -0.0047, 'grad_norm': 0.0015972270630300045, 'learning_rate': 1e-05, 'num_tokens': 339871184.0, 'completions/mean_length': 7588.6953125, 'completions/min_length': 491.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7377.6083984375, 'completions/min_terminated_length': 491.0, 'completions/max_terminated_length': 16192.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.1820138692855835, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.022462764754891396, 'sampling/sampling_logp_difference/max': 8.749972343444824, 'sampling/importance_sampling_ratio/min': 0.00015846571477595717, 'sampling/importance_sampling_ratio/mean': 0.9999430775642395, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1119055226445198, 'clip_ratio/low_mean': 2.64205210100954e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5224193766225653e-06, 'clip_ratio/high_max': 1.074430110747926e-05, 'clip_ratio/region_mean': 2.9942940273031127e-05, 'epoch': 0.37}
+
+ 40%|███▉      | 406/1024 [18:10:03<29:55:18, 174.30s/it][AINFO 12-02 18:23:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:23:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:23:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:23:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|███▉      | 407/1024 [18:12:41<29:03:24, 169.54s/it][A
+                                                         [A{'loss': 0.0607, 'grad_norm': 0.002284019021317363, 'learning_rate': 1e-05, 'num_tokens': 340725769.0, 'completions/mean_length': 6522.4453125, 'completions/min_length': 872.0, 'completions/max_length': 16331.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6522.4453125, 'completions/min_terminated_length': 872.0, 'completions/max_terminated_length': 16331.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.28749164938926697, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02157575450837612, 'sampling/sampling_logp_difference/max': 7.022393703460693, 'sampling/importance_sampling_ratio/min': 0.0008916885708458722, 'sampling/importance_sampling_ratio/mean': 0.9998612999916077, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0155515000224113, 'clip_ratio/low_mean': 4.525409747202502e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.111276271283714e-06, 'clip_ratio/high_max': 1.2445105085134855e-05, 'clip_ratio/region_mean': 4.836537357277848e-05, 'epoch': 0.37}
+
+ 40%|███▉      | 407/1024 [18:12:41<29:03:24, 169.54s/it][AINFO 12-02 18:26:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:26:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:26:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:26:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|███▉      | 408/1024 [18:15:25<28:42:14, 167.75s/it][A
+                                                         [A{'loss': 0.0622, 'grad_norm': 0.0010391590185463428, 'learning_rate': 1e-05, 'num_tokens': 341610881.0, 'completions/mean_length': 6749.125, 'completions/min_length': 1156.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6517.88818359375, 'completions/min_terminated_length': 1156.0, 'completions/max_terminated_length': 13961.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.1990984082221985, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02200891077518463, 'sampling/sampling_logp_difference/max': 8.47312068939209, 'sampling/importance_sampling_ratio/min': 0.00020901163225062191, 'sampling/importance_sampling_ratio/mean': 0.9999426007270813, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0721680670976639, 'clip_ratio/low_mean': 2.7839718427458138e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.614079844282969e-06, 'clip_ratio/high_max': 1.4456319377131877e-05, 'clip_ratio/region_mean': 3.1453798442271363e-05, 'epoch': 0.38}
+
+ 40%|███▉      | 408/1024 [18:15:25<28:42:14, 167.75s/it][AINFO 12-02 18:28:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:28:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:28:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:28:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|███▉      | 409/1024 [18:18:28<29:27:03, 172.40s/it][A
+                                                         [A{'loss': 0.0168, 'grad_norm': 0.0022965834941715, 'learning_rate': 1e-05, 'num_tokens': 342652897.0, 'completions/mean_length': 7988.4375, 'completions/min_length': 923.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7647.154296875, 'completions/min_terminated_length': 923.0, 'completions/max_terminated_length': 16374.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2459382861852646, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020454837009310722, 'sampling/sampling_logp_difference/max': 7.855283260345459, 'sampling/importance_sampling_ratio/min': 0.0003876982373185456, 'sampling/importance_sampling_ratio/mean': 0.9999749660491943, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9933496564626694, 'clip_ratio/low_mean': 1.4307706237559614e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.4307706237559614e-05, 'epoch': 0.38}
+
+ 40%|███▉      | 409/1024 [18:18:28<29:27:03, 172.40s/it][AINFO 12-02 18:32:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:32:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:32:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:32:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|████      | 410/1024 [18:21:19<29:19:17, 171.92s/it][A
+                                                         [A{'loss': 0.0501, 'grad_norm': 0.0027879721019417048, 'learning_rate': 1e-05, 'num_tokens': 343578670.0, 'completions/mean_length': 7072.4140625, 'completions/min_length': 442.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6848.9365234375, 'completions/min_terminated_length': 442.0, 'completions/max_terminated_length': 15968.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3043339252471924, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021169768646359444, 'sampling/sampling_logp_difference/max': 9.043949127197266, 'sampling/importance_sampling_ratio/min': 0.0001181035113404505, 'sampling/importance_sampling_ratio/mean': 1.0000234842300415, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9560660421848297, 'clip_ratio/low_mean': 3.983285796493874e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6455742866128276e-06, 'clip_ratio/high_max': 6.58229714645131e-06, 'clip_ratio/region_mean': 4.14784317399608e-05, 'epoch': 0.38}
+
+ 40%|████      | 410/1024 [18:21:19<29:19:17, 171.92s/it][AINFO 12-02 18:34:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:34:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:34:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:34:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|████      | 411/1024 [18:24:21<29:48:28, 175.05s/it][A
+                                                         [A{'loss': 0.0457, 'grad_norm': 0.0016929456032812595, 'learning_rate': 1e-05, 'num_tokens': 344441080.0, 'completions/mean_length': 6586.515625, 'completions/min_length': 613.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6270.4677734375, 'completions/min_terminated_length': 613.0, 'completions/max_terminated_length': 15768.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.20175683498382568, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019590143114328384, 'sampling/sampling_logp_difference/max': 4.276163101196289, 'sampling/importance_sampling_ratio/min': 0.013895876705646515, 'sampling/importance_sampling_ratio/mean': 0.9999939799308777, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.893077902495861, 'clip_ratio/low_mean': 2.449715702823596e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.784176326211309e-06, 'clip_ratio/high_max': 1.9136705304845236e-05, 'clip_ratio/region_mean': 2.928133335444727e-05, 'epoch': 0.38}
+
+ 40%|████      | 411/1024 [18:24:21<29:48:28, 175.05s/it][AINFO 12-02 18:37:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:37:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:37:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:37:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|████      | 412/1024 [18:27:09<29:22:08, 172.76s/it][A
+                                                         [A{'loss': 0.0638, 'grad_norm': 0.0024831818882375956, 'learning_rate': 1e-05, 'num_tokens': 345472414.0, 'completions/mean_length': 7903.296875, 'completions/min_length': 1820.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7629.7255859375, 'completions/min_terminated_length': 1820.0, 'completions/max_terminated_length': 15765.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3243142366409302, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020601853728294373, 'sampling/sampling_logp_difference/max': 6.974874973297119, 'sampling/importance_sampling_ratio/min': 0.0009350833133794367, 'sampling/importance_sampling_ratio/mean': 0.9999592900276184, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.943502850830555, 'clip_ratio/low_mean': 5.4418370382336434e-05, 'clip_ratio/low_min': 1.5258214943969506e-05, 'clip_ratio/high_mean': 3.1554840234093717e-06, 'clip_ratio/high_max': 1.2621936093637487e-05, 'clip_ratio/region_mean': 5.7573854519432643e-05, 'epoch': 0.38}
+
+ 40%|████      | 412/1024 [18:27:09<29:22:08, 172.76s/it][AINFO 12-02 18:40:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:40:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:40:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:40:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|████      | 413/1024 [18:29:49<28:41:39, 169.07s/it][A
+                                                         [A{'loss': 0.0724, 'grad_norm': 0.0032182165887206793, 'learning_rate': 1e-05, 'num_tokens': 346388112.0, 'completions/mean_length': 7016.890625, 'completions/min_length': 5.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6943.1337890625, 'completions/min_terminated_length': 5.0, 'completions/max_terminated_length': 15211.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3306122422218323, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020366424694657326, 'sampling/sampling_logp_difference/max': 8.261494636535645, 'sampling/importance_sampling_ratio/min': 0.000258272688370198, 'sampling/importance_sampling_ratio/mean': 0.9999998807907104, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9670446068048477, 'clip_ratio/low_mean': 2.9159931841604703e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.173523380923143e-06, 'clip_ratio/high_max': 2.738965622484102e-05, 'clip_ratio/region_mean': 3.8333455336214683e-05, 'epoch': 0.38}
+
+ 40%|████      | 413/1024 [18:29:49<28:41:39, 169.07s/it][AINFO 12-02 18:43:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:43:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:43:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:43:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|████      | 414/1024 [18:32:47<29:04:58, 171.64s/it][A
+                                                         [A{'loss': 0.0724, 'grad_norm': 0.0023274575360119343, 'learning_rate': 1e-05, 'num_tokens': 347312071.0, 'completions/mean_length': 7043.8046875, 'completions/min_length': 1331.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6819.64013671875, 'completions/min_terminated_length': 1331.0, 'completions/max_terminated_length': 16038.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021443769335746765, 'sampling/sampling_logp_difference/max': 6.607020378112793, 'sampling/importance_sampling_ratio/min': 0.0013508512638509274, 'sampling/importance_sampling_ratio/mean': 0.9999589323997498, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.022966854274273, 'clip_ratio/low_mean': 2.1530643095957203e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5997748050722294e-06, 'clip_ratio/high_max': 6.399099220288917e-06, 'clip_ratio/region_mean': 2.3130417901029432e-05, 'epoch': 0.38}
+
+ 40%|████      | 414/1024 [18:32:47<29:04:58, 171.64s/it][AINFO 12-02 18:46:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:46:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:46:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:46:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 415/1024 [18:35:29<28:33:42, 168.84s/it][A
+                                                         [A{'loss': 0.0639, 'grad_norm': 0.0008460046374239028, 'learning_rate': 1e-05, 'num_tokens': 348161394.0, 'completions/mean_length': 6475.6484375, 'completions/min_length': 506.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6318.37353515625, 'completions/min_terminated_length': 506.0, 'completions/max_terminated_length': 14908.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.22620806097984314, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019627809524536133, 'sampling/sampling_logp_difference/max': 4.421597957611084, 'sampling/importance_sampling_ratio/min': 0.012015017680823803, 'sampling/importance_sampling_ratio/mean': 0.9999297261238098, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9873237758874893, 'clip_ratio/low_mean': 2.3596727601216116e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.7401289293702575e-06, 'clip_ratio/high_max': 1.896051571748103e-05, 'clip_ratio/region_mean': 2.83368563032127e-05, 'epoch': 0.38}
+
+ 41%|████      | 415/1024 [18:35:29<28:33:42, 168.84s/it][AINFO 12-02 18:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:49:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 416/1024 [18:38:23<28:47:21, 170.46s/it][A
+                                                         [A{'loss': 0.0758, 'grad_norm': 0.0017075197538360953, 'learning_rate': 1e-05, 'num_tokens': 349211078.0, 'completions/mean_length': 8038.90625, 'completions/min_length': 802.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7699.67431640625, 'completions/min_terminated_length': 802.0, 'completions/max_terminated_length': 16243.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.30221715569496155, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020795777440071106, 'sampling/sampling_logp_difference/max': 8.91385555267334, 'sampling/importance_sampling_ratio/min': 0.0001345122145721689, 'sampling/importance_sampling_ratio/mean': 1.000017523765564, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9513615965843201, 'clip_ratio/low_mean': 4.485099543671822e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.968342807387671e-06, 'clip_ratio/high_max': 1.9873371229550685e-05, 'clip_ratio/region_mean': 4.981933852832299e-05, 'epoch': 0.38}
+
+ 41%|████      | 416/1024 [18:38:23<28:47:21, 170.46s/it][AINFO 12-02 18:51:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:51:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:51:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:51:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 417/1024 [18:41:19<29:00:43, 172.06s/it][A
+                                                         [A{'loss': 0.0609, 'grad_norm': 0.0016014629509299994, 'learning_rate': 1e-05, 'num_tokens': 350171613.0, 'completions/mean_length': 7339.3046875, 'completions/min_length': 1002.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7122.232421875, 'completions/min_terminated_length': 1002.0, 'completions/max_terminated_length': 16304.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021187925711274147, 'sampling/sampling_logp_difference/max': 18.101272583007812, 'sampling/importance_sampling_ratio/min': 1.3763129302901689e-08, 'sampling/importance_sampling_ratio/mean': 0.9999157190322876, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9872350245714188, 'clip_ratio/low_mean': 4.385826059660758e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.941148846337455e-07, 'clip_ratio/high_max': 3.976459538534982e-06, 'clip_ratio/region_mean': 4.485237468543346e-05, 'epoch': 0.38}
+
+ 41%|████      | 417/1024 [18:41:19<29:00:43, 172.06s/it][AINFO 12-02 18:54:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:54:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:54:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:54:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 418/1024 [18:43:56<28:11:19, 167.46s/it][A
+                                                         [A{'loss': 0.0038, 'grad_norm': 0.0025373264215886593, 'learning_rate': 1e-05, 'num_tokens': 351116803.0, 'completions/mean_length': 7239.359375, 'completions/min_length': 1294.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7094.20654296875, 'completions/min_terminated_length': 1294.0, 'completions/max_terminated_length': 15341.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.24671243131160736, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019912682473659515, 'sampling/sampling_logp_difference/max': 4.248099327087402, 'sampling/importance_sampling_ratio/min': 0.014291372150182724, 'sampling/importance_sampling_ratio/mean': 0.9999785423278809, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9430425837635994, 'clip_ratio/low_mean': 2.512099752038921e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3236594870468252e-06, 'clip_ratio/high_max': 9.294637948187301e-06, 'clip_ratio/region_mean': 2.7444657121122873e-05, 'epoch': 0.38}
+
+ 41%|████      | 418/1024 [18:43:56<28:11:19, 167.46s/it][AINFO 12-02 18:57:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:57:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:57:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:57:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 419/1024 [18:46:52<28:34:07, 170.00s/it][A
+                                                         [A{'loss': 0.1042, 'grad_norm': 0.002612616401165724, 'learning_rate': 1e-05, 'num_tokens': 352059034.0, 'completions/mean_length': 7211.7421875, 'completions/min_length': 209.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7066.1513671875, 'completions/min_terminated_length': 209.0, 'completions/max_terminated_length': 15833.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.625, 'reward_std': 0.327729195356369, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01921844482421875, 'sampling/sampling_logp_difference/max': 12.87159538269043, 'sampling/importance_sampling_ratio/min': 2.5700239802972646e-06, 'sampling/importance_sampling_ratio/mean': 0.9999889731407166, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.841051459312439, 'clip_ratio/low_mean': 3.522799016764111e-05, 'clip_ratio/low_min': 6.063465662009548e-06, 'clip_ratio/high_mean': 5.310340270625602e-06, 'clip_ratio/high_max': 1.5709408671682468e-05, 'clip_ratio/region_mean': 4.053833055195355e-05, 'epoch': 0.39}
+
+ 41%|████      | 419/1024 [18:46:52<28:34:07, 170.00s/it][AINFO 12-02 19:00:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:00:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:00:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:00:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 420/1024 [18:49:23<27:33:11, 164.22s/it][A
+                                                         [A{'loss': 0.0363, 'grad_norm': 0.0019967984408140182, 'learning_rate': 1e-05, 'num_tokens': 352896219.0, 'completions/mean_length': 6374.6953125, 'completions/min_length': 693.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6295.8818359375, 'completions/min_terminated_length': 693.0, 'completions/max_terminated_length': 14390.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.19438526034355164, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02013232931494713, 'sampling/sampling_logp_difference/max': 4.295470237731934, 'sampling/importance_sampling_ratio/min': 0.020119966939091682, 'sampling/importance_sampling_ratio/mean': 0.999984860420227, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0578313246369362, 'clip_ratio/low_mean': 1.714175300548959e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.799111203126813e-06, 'clip_ratio/high_max': 7.196444812507252e-06, 'clip_ratio/region_mean': 1.894086381071247e-05, 'epoch': 0.39}
+
+ 41%|████      | 420/1024 [18:49:23<27:33:11, 164.22s/it][AINFO 12-02 19:02:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:02:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:02:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:02:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 421/1024 [18:52:41<29:12:15, 174.35s/it][A
+                                                         [A{'loss': 0.067, 'grad_norm': 0.0018279170617461205, 'learning_rate': 1e-05, 'num_tokens': 353844990.0, 'completions/mean_length': 7274.7109375, 'completions/min_length': 1191.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6667.42529296875, 'completions/min_terminated_length': 1191.0, 'completions/max_terminated_length': 16313.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.29696235060691833, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017373956739902496, 'sampling/sampling_logp_difference/max': 8.54503345489502, 'sampling/importance_sampling_ratio/min': 0.00019450874242465943, 'sampling/importance_sampling_ratio/mean': 0.9998984336853027, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7415856420993805, 'clip_ratio/low_mean': 4.0026389058311906e-05, 'clip_ratio/low_min': 8.968050451585441e-06, 'clip_ratio/high_mean': 7.73928195485496e-06, 'clip_ratio/high_max': 3.095712781941984e-05, 'clip_ratio/region_mean': 4.7765669989985327e-05, 'epoch': 0.39}
+
+ 41%|████      | 421/1024 [18:52:41<29:12:15, 174.35s/it][AINFO 12-02 19:06:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:06:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:06:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:06:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 422/1024 [18:55:30<28:53:39, 172.79s/it][A
+                                                         [A{'loss': 0.0534, 'grad_norm': 0.001777544734068215, 'learning_rate': 1e-05, 'num_tokens': 354873933.0, 'completions/mean_length': 7902.9296875, 'completions/min_length': 339.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7836.1494140625, 'completions/min_terminated_length': 339.0, 'completions/max_terminated_length': 15676.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021197015419602394, 'sampling/sampling_logp_difference/max': 6.903773307800293, 'sampling/importance_sampling_ratio/min': 0.001003989833407104, 'sampling/importance_sampling_ratio/mean': 0.9999098777770996, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0019611343741417, 'clip_ratio/low_mean': 4.277909783922951e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3981242495428887e-06, 'clip_ratio/high_max': 1.3592496998171555e-05, 'clip_ratio/region_mean': 4.6177221065590857e-05, 'epoch': 0.39}
+
+ 41%|████      | 422/1024 [18:55:30<28:53:39, 172.79s/it][AINFO 12-02 19:09:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:09:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:09:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:09:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████▏     | 423/1024 [18:58:28<29:08:20, 174.54s/it][A
+                                                         [A{'loss': 0.0813, 'grad_norm': 0.00153827341273427, 'learning_rate': 1e-05, 'num_tokens': 355829507.0, 'completions/mean_length': 7304.046875, 'completions/min_length': 487.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6617.328125, 'completions/min_terminated_length': 487.0, 'completions/max_terminated_length': 14700.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2982654273509979, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019642215222120285, 'sampling/sampling_logp_difference/max': 10.172656059265137, 'sampling/importance_sampling_ratio/min': 3.820072379312478e-05, 'sampling/importance_sampling_ratio/mean': 0.9999763369560242, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8584602400660515, 'clip_ratio/low_mean': 2.635721989463491e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.122522617919458e-06, 'clip_ratio/high_max': 2.524704336792638e-05, 'clip_ratio/region_mean': 3.347974279677146e-05, 'epoch': 0.39}
+
+ 41%|████▏     | 423/1024 [18:58:28<29:08:20, 174.54s/it][AINFO 12-02 19:12:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:12:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:12:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:12:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████▏     | 424/1024 [19:01:05<28:12:00, 169.20s/it][A
+                                                         [A{'loss': 0.0311, 'grad_norm': 0.002107275417074561, 'learning_rate': 1e-05, 'num_tokens': 356573231.0, 'completions/mean_length': 5677.21875, 'completions/min_length': 76.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5507.27001953125, 'completions/min_terminated_length': 76.0, 'completions/max_terminated_length': 15074.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.25354719161987305, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020722679793834686, 'sampling/sampling_logp_difference/max': 6.243190288543701, 'sampling/importance_sampling_ratio/min': 0.0019436449510976672, 'sampling/importance_sampling_ratio/mean': 0.9999579191207886, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0159753635525703, 'clip_ratio/low_mean': 3.204466929673799e-05, 'clip_ratio/low_min': 3.388819550309563e-06, 'clip_ratio/high_mean': 1.2564329381348216e-06, 'clip_ratio/high_max': 5.025731752539286e-06, 'clip_ratio/region_mean': 3.330110212118598e-05, 'epoch': 0.39}
+
+ 41%|████▏     | 424/1024 [19:01:05<28:12:00, 169.20s/it][AINFO 12-02 19:14:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:14:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:14:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:14:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 425/1024 [19:03:38<27:21:00, 164.37s/it][A
+                                                         [A{'loss': 0.0828, 'grad_norm': 0.001720887958072126, 'learning_rate': 1e-05, 'num_tokens': 357387169.0, 'completions/mean_length': 6209.078125, 'completions/min_length': 723.0, 'completions/max_length': 15407.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6209.078125, 'completions/min_terminated_length': 723.0, 'completions/max_terminated_length': 15407.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017990771681070328, 'sampling/sampling_logp_difference/max': 12.374984741210938, 'sampling/importance_sampling_ratio/min': 4.222915777063463e-06, 'sampling/importance_sampling_ratio/mean': 1.000000238418579, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8867508247494698, 'clip_ratio/low_mean': 1.6582721229951858e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.685912929540791e-06, 'clip_ratio/high_max': 1.4743651718163164e-05, 'clip_ratio/region_mean': 2.0268634500553162e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 425/1024 [19:03:38<27:21:00, 164.37s/it][AINFO 12-02 19:17:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:17:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:17:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:17:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 426/1024 [19:06:36<27:58:56, 168.46s/it][A
+                                                         [A{'loss': 0.0853, 'grad_norm': 0.0012448625639081001, 'learning_rate': 1e-05, 'num_tokens': 358334584.0, 'completions/mean_length': 7183.3671875, 'completions/min_length': 375.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6962.55224609375, 'completions/min_terminated_length': 375.0, 'completions/max_terminated_length': 15845.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.17464719712734222, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020646382123231888, 'sampling/sampling_logp_difference/max': 7.874577045440674, 'sampling/importance_sampling_ratio/min': 0.00038028976996429265, 'sampling/importance_sampling_ratio/mean': 0.9999426007270813, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9918289259076118, 'clip_ratio/low_mean': 1.6993449889923795e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9675123894558055e-06, 'clip_ratio/high_max': 7.870049557823222e-06, 'clip_ratio/region_mean': 1.89609622793796e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 426/1024 [19:06:36<27:58:56, 168.46s/it][AINFO 12-02 19:20:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:20:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:20:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:20:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 427/1024 [19:09:09<27:09:15, 163.74s/it][A
+                                                         [A{'loss': 0.0686, 'grad_norm': 0.001519464422017336, 'learning_rate': 1e-05, 'num_tokens': 359233451.0, 'completions/mean_length': 6877.1484375, 'completions/min_length': 2027.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6802.29150390625, 'completions/min_terminated_length': 2027.0, 'completions/max_terminated_length': 15106.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.19438527524471283, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01951739378273487, 'sampling/sampling_logp_difference/max': 4.748780250549316, 'sampling/importance_sampling_ratio/min': 0.008662254549562931, 'sampling/importance_sampling_ratio/mean': 0.9998905658721924, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8806835636496544, 'clip_ratio/low_mean': 2.9313079608073167e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.9313079608073167e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 427/1024 [19:09:09<27:09:15, 163.74s/it][AINFO 12-02 19:22:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:22:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:22:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:22:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 428/1024 [19:12:00<27:28:19, 165.94s/it][A
+                                                         [A{'loss': 0.1217, 'grad_norm': 0.0022954042069613934, 'learning_rate': 1e-05, 'num_tokens': 360143003.0, 'completions/mean_length': 6964.6875, 'completions/min_length': 1148.0, 'completions/max_length': 16061.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6964.6875, 'completions/min_terminated_length': 1148.0, 'completions/max_terminated_length': 16061.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3253750801086426, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01880607008934021, 'sampling/sampling_logp_difference/max': 8.499954223632812, 'sampling/importance_sampling_ratio/min': 0.00020347768440842628, 'sampling/importance_sampling_ratio/mean': 0.9999755620956421, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8069597631692886, 'clip_ratio/low_mean': 4.164141705587099e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.164141705587099e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 428/1024 [19:12:00<27:28:19, 165.94s/it][AINFO 12-02 19:25:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:25:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:25:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:25:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 429/1024 [19:14:59<28:04:18, 169.85s/it][A
+                                                         [A{'loss': 0.0094, 'grad_norm': 0.0021376016084104776, 'learning_rate': 1e-05, 'num_tokens': 361101379.0, 'completions/mean_length': 7322.0, 'completions/min_length': 901.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7178.1591796875, 'completions/min_terminated_length': 901.0, 'completions/max_terminated_length': 16123.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.15308690071105957, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021568164229393005, 'sampling/sampling_logp_difference/max': 9.069122314453125, 'sampling/importance_sampling_ratio/min': 0.00011516757513163611, 'sampling/importance_sampling_ratio/mean': 1.000006079673767, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0852478593587875, 'clip_ratio/low_mean': 1.7779158497432945e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.7779158497432945e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 429/1024 [19:14:59<28:04:18, 169.85s/it][AINFO 12-02 19:28:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:28:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:28:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:28:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 430/1024 [19:17:54<28:15:33, 171.27s/it][A
+                                                         [A{'loss': 0.0875, 'grad_norm': 0.0031003563199192286, 'learning_rate': 1e-05, 'num_tokens': 362018284.0, 'completions/mean_length': 7027.0078125, 'completions/min_length': 967.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6646.64208984375, 'completions/min_terminated_length': 967.0, 'completions/max_terminated_length': 15880.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3243093490600586, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020102323964238167, 'sampling/sampling_logp_difference/max': 6.873187065124512, 'sampling/importance_sampling_ratio/min': 0.0010351726086810231, 'sampling/importance_sampling_ratio/mean': 0.9999800324440002, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8932972475886345, 'clip_ratio/low_mean': 4.4633561628870666e-05, 'clip_ratio/low_min': 4.338168764661532e-06, 'clip_ratio/high_mean': 8.905177651286067e-06, 'clip_ratio/high_max': 3.1260904052032856e-05, 'clip_ratio/region_mean': 5.353873848434887e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 430/1024 [19:17:54<28:15:33, 171.27s/it][AINFO 12-02 19:31:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:31:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:31:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:31:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 431/1024 [19:20:31<27:33:09, 167.27s/it][A
+                                                         [A{'loss': 0.0701, 'grad_norm': 0.0018177316524088383, 'learning_rate': 1e-05, 'num_tokens': 362851107.0, 'completions/mean_length': 6341.7421875, 'completions/min_length': 939.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6262.66943359375, 'completions/min_terminated_length': 939.0, 'completions/max_terminated_length': 15430.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.28171277046203613, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020463842898607254, 'sampling/sampling_logp_difference/max': 7.50155782699585, 'sampling/importance_sampling_ratio/min': 0.0005522234132513404, 'sampling/importance_sampling_ratio/mean': 0.999906063079834, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.885854922235012, 'clip_ratio/low_mean': 4.263560651907028e-05, 'clip_ratio/low_min': 1.8708525658439612e-05, 'clip_ratio/high_mean': 4.871089572588971e-06, 'clip_ratio/high_max': 1.5146189525694354e-05, 'clip_ratio/region_mean': 4.7506695409538224e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 431/1024 [19:20:31<27:33:09, 167.27s/it][AINFO 12-02 19:34:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:34:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:34:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:34:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 432/1024 [19:23:03<26:44:16, 162.59s/it][A
+                                                         [A{'loss': 0.0503, 'grad_norm': 0.001520519028417766, 'learning_rate': 1e-05, 'num_tokens': 363691019.0, 'completions/mean_length': 6388.875, 'completions/min_length': 220.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6310.17333984375, 'completions/min_terminated_length': 220.0, 'completions/max_terminated_length': 12672.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2972046136856079, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019675832241773605, 'sampling/sampling_logp_difference/max': 6.800905227661133, 'sampling/importance_sampling_ratio/min': 0.0011127673787996173, 'sampling/importance_sampling_ratio/mean': 1.0000056028366089, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9593783840537071, 'clip_ratio/low_mean': 3.242748857701372e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.9246351611363934e-06, 'clip_ratio/high_max': 1.9989562133559957e-05, 'clip_ratio/region_mean': 3.835212419289746e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 432/1024 [19:23:03<26:44:16, 162.59s/it][AINFO 12-02 19:36:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:36:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:36:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:36:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 433/1024 [19:25:59<27:20:17, 166.53s/it][A
+                                                         [A{'loss': 0.026, 'grad_norm': 0.0016424815403297544, 'learning_rate': 1e-05, 'num_tokens': 364679475.0, 'completions/mean_length': 7559.875, 'completions/min_length': 1292.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7125.9013671875, 'completions/min_terminated_length': 1292.0, 'completions/max_terminated_length': 15240.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2409384697675705, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01943236216902733, 'sampling/sampling_logp_difference/max': 8.244573593139648, 'sampling/importance_sampling_ratio/min': 0.00026268011424690485, 'sampling/importance_sampling_ratio/mean': 1.0000078678131104, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8298296853899956, 'clip_ratio/low_mean': 2.4561562668168335e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.4561562668168335e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 433/1024 [19:25:59<27:20:17, 166.53s/it][AINFO 12-02 19:39:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:39:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:39:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:39:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 434/1024 [19:28:31<26:34:09, 162.12s/it][A
+                                                         [A{'loss': 0.0383, 'grad_norm': 0.0020216500852257013, 'learning_rate': 1e-05, 'num_tokens': 365464588.0, 'completions/mean_length': 5993.1328125, 'completions/min_length': 461.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5743.75244140625, 'completions/min_terminated_length': 461.0, 'completions/max_terminated_length': 16162.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.28353995084762573, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0196966715157032, 'sampling/sampling_logp_difference/max': 7.352303504943848, 'sampling/importance_sampling_ratio/min': 0.0006411138456314802, 'sampling/importance_sampling_ratio/mean': 0.9999315142631531, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9428447484970093, 'clip_ratio/low_mean': 3.785217859331169e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.40527742789709e-06, 'clip_ratio/high_max': 9.62110971158836e-06, 'clip_ratio/region_mean': 4.025745568014827e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 434/1024 [19:28:31<26:34:09, 162.12s/it][AINFO 12-02 19:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:42:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 435/1024 [19:31:08<26:17:17, 160.68s/it][A
+                                                         [A{'loss': 0.0816, 'grad_norm': 0.0017544744769111276, 'learning_rate': 1e-05, 'num_tokens': 366167481.0, 'completions/mean_length': 5325.0390625, 'completions/min_length': 530.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5149.50048828125, 'completions/min_terminated_length': 530.0, 'completions/max_terminated_length': 15118.0, 'rewards/accuracy_reward/mean': 0.671875, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.671875, 'reward_std': 0.30091896653175354, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01736798696219921, 'sampling/sampling_logp_difference/max': 7.121835708618164, 'sampling/importance_sampling_ratio/min': 0.0008072834461927414, 'sampling/importance_sampling_ratio/mean': 1.000058889389038, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7730643972754478, 'clip_ratio/low_mean': 2.5303937945864163e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3819919735833537e-06, 'clip_ratio/high_max': 1.3527967894333415e-05, 'clip_ratio/region_mean': 2.8685930146821192e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 435/1024 [19:31:08<26:17:17, 160.68s/it][AINFO 12-02 19:44:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:44:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:44:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:44:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 436/1024 [19:33:49<26:15:22, 160.75s/it][A
+                                                         [A{'loss': 0.0445, 'grad_norm': 0.0013123912503942847, 'learning_rate': 1e-05, 'num_tokens': 367126948.0, 'completions/mean_length': 7331.3359375, 'completions/min_length': 1160.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7114.072265625, 'completions/min_terminated_length': 1160.0, 'completions/max_terminated_length': 15321.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3243093490600586, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020918458700180054, 'sampling/sampling_logp_difference/max': 6.062497138977051, 'sampling/importance_sampling_ratio/min': 0.0023285788483917713, 'sampling/importance_sampling_ratio/mean': 0.9999157786369324, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9418040588498116, 'clip_ratio/low_mean': 2.7905126785299217e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1566120810566645e-06, 'clip_ratio/high_max': 8.82370454746706e-06, 'clip_ratio/region_mean': 3.1061739150572976e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 436/1024 [19:33:49<26:15:22, 160.75s/it][AINFO 12-02 19:47:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:47:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:47:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:47:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 437/1024 [19:36:59<27:38:36, 169.53s/it][A
+                                                         [A{'loss': 0.0656, 'grad_norm': 0.0021181178744882345, 'learning_rate': 1e-05, 'num_tokens': 368071772.0, 'completions/mean_length': 7217.25, 'completions/min_length': 867.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6686.94189453125, 'completions/min_terminated_length': 867.0, 'completions/max_terminated_length': 16209.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3593195080757141, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.020650038495659828, 'sampling/sampling_logp_difference/max': 7.299652099609375, 'sampling/importance_sampling_ratio/min': 0.000675773830153048, 'sampling/importance_sampling_ratio/mean': 0.9998823404312134, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9499563127756119, 'clip_ratio/low_mean': 4.105965246026244e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.1592835486881086e-06, 'clip_ratio/high_max': 1.6637134194752434e-05, 'clip_ratio/region_mean': 4.521893566789004e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 437/1024 [19:36:59<27:38:36, 169.53s/it][AINFO 12-02 19:50:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:50:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:50:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:50:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 438/1024 [19:39:52<27:46:22, 170.62s/it][A
+                                                         [A{'loss': 0.0159, 'grad_norm': 0.0019147706916555762, 'learning_rate': 1e-05, 'num_tokens': 369055650.0, 'completions/mean_length': 7549.796875, 'completions/min_length': 1037.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7264.822265625, 'completions/min_terminated_length': 1037.0, 'completions/max_terminated_length': 15669.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02183394506573677, 'sampling/sampling_logp_difference/max': 6.885855674743652, 'sampling/importance_sampling_ratio/min': 0.0010221411939710379, 'sampling/importance_sampling_ratio/mean': 0.9999600648880005, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0309365764260292, 'clip_ratio/low_mean': 3.4911336570075946e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0107627253528335e-06, 'clip_ratio/high_max': 1.2043050901411334e-05, 'clip_ratio/region_mean': 3.792209963648929e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 438/1024 [19:39:52<27:46:22, 170.62s/it][AINFO 12-02 19:53:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:53:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:53:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:53:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 439/1024 [19:42:30<27:06:43, 166.84s/it][A
+                                                         [A{'loss': 0.095, 'grad_norm': 0.0031485585495829582, 'learning_rate': 1e-05, 'num_tokens': 369938574.0, 'completions/mean_length': 6727.53125, 'completions/min_length': 1157.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6651.49609375, 'completions/min_terminated_length': 1157.0, 'completions/max_terminated_length': 15269.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3306073546409607, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020127974450588226, 'sampling/sampling_logp_difference/max': 7.249527931213379, 'sampling/importance_sampling_ratio/min': 0.000710509717464447, 'sampling/importance_sampling_ratio/mean': 1.0000512599945068, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9676288217306137, 'clip_ratio/low_mean': 3.9529069113086734e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.858355269898311e-07, 'clip_ratio/high_max': 3.9433421079593245e-06, 'clip_ratio/region_mean': 4.051490452638973e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 439/1024 [19:42:30<27:06:43, 166.84s/it][AINFO 12-02 19:56:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:56:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:56:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:56:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 440/1024 [19:45:12<26:49:45, 165.39s/it][A
+                                                         [A{'loss': 0.0332, 'grad_norm': 0.003415121464058757, 'learning_rate': 1e-05, 'num_tokens': 370760459.0, 'completions/mean_length': 6281.7265625, 'completions/min_length': 672.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5955.8466796875, 'completions/min_terminated_length': 672.0, 'completions/max_terminated_length': 15864.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02063862606883049, 'sampling/sampling_logp_difference/max': 14.613539695739746, 'sampling/importance_sampling_ratio/min': 4.502153956309485e-07, 'sampling/importance_sampling_ratio/mean': 0.9999826550483704, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9817835092544556, 'clip_ratio/low_mean': 3.8503443363424594e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5108157046815904e-06, 'clip_ratio/high_max': 1.0043262818726362e-05, 'clip_ratio/region_mean': 4.101425872704567e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 440/1024 [19:45:12<26:49:45, 165.39s/it][AINFO 12-02 19:58:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:58:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:58:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:58:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 441/1024 [19:47:56<26:42:00, 164.87s/it][A
+                                                         [A{'loss': 0.0366, 'grad_norm': 0.00214037811383605, 'learning_rate': 1e-05, 'num_tokens': 371649103.0, 'completions/mean_length': 6765.71875, 'completions/min_length': 1444.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6689.984375, 'completions/min_terminated_length': 1444.0, 'completions/max_terminated_length': 16272.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.1830746978521347, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02172943949699402, 'sampling/sampling_logp_difference/max': 6.249993801116943, 'sampling/importance_sampling_ratio/min': 0.001930466154590249, 'sampling/importance_sampling_ratio/mean': 0.9999620318412781, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0270514711737633, 'clip_ratio/low_mean': 1.8947657395074202e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.003677183092805e-06, 'clip_ratio/high_max': 2.3593061087012757e-05, 'clip_ratio/region_mean': 2.5951335032914358e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 441/1024 [19:47:56<26:42:00, 164.87s/it][AINFO 12-02 20:01:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:01:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:01:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:01:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 442/1024 [19:51:00<27:34:13, 170.54s/it][A
+                                                         [A{'loss': 0.0022, 'grad_norm': 0.003510556183755398, 'learning_rate': 1e-05, 'num_tokens': 372624535.0, 'completions/mean_length': 7481.625, 'completions/min_length': 782.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6966.611328125, 'completions/min_terminated_length': 782.0, 'completions/max_terminated_length': 16159.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3464162349700928, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02149931713938713, 'sampling/sampling_logp_difference/max': 7.894177436828613, 'sampling/importance_sampling_ratio/min': 0.0003729084855876863, 'sampling/importance_sampling_ratio/mean': 0.9999492168426514, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9730701074004173, 'clip_ratio/low_mean': 5.011202529203729e-05, 'clip_ratio/low_min': 3.1568047234031837e-06, 'clip_ratio/high_mean': 6.259035217226483e-06, 'clip_ratio/high_max': 2.1009727788623422e-05, 'clip_ratio/region_mean': 5.637106050926377e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 442/1024 [19:51:00<27:34:13, 170.54s/it][AINFO 12-02 20:04:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:04:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:04:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:04:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 443/1024 [19:53:54<27:42:50, 171.72s/it][A
+                                                         [A{'loss': 0.062, 'grad_norm': 0.0018307552672922611, 'learning_rate': 1e-05, 'num_tokens': 373732962.0, 'completions/mean_length': 8474.2109375, 'completions/min_length': 983.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8152.67431640625, 'completions/min_terminated_length': 983.0, 'completions/max_terminated_length': 15994.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.3214184641838074, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02121492102742195, 'sampling/sampling_logp_difference/max': 15.624551773071289, 'sampling/importance_sampling_ratio/min': 1.6381112288854638e-07, 'sampling/importance_sampling_ratio/mean': 0.9999135732650757, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9761426225304604, 'clip_ratio/low_mean': 3.781230475397024e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.248057727338164e-07, 'clip_ratio/high_max': 2.8992230909352656e-06, 'clip_ratio/region_mean': 3.853711018564354e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 443/1024 [19:53:54<27:42:50, 171.72s/it][AINFO 12-02 20:07:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:07:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:07:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:07:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 444/1024 [19:56:34<27:06:37, 168.27s/it][A
+                                                         [A{'loss': -0.0046, 'grad_norm': 0.0021801323164254427, 'learning_rate': 1e-05, 'num_tokens': 374706548.0, 'completions/mean_length': 7433.953125, 'completions/min_length': 1108.0, 'completions/max_length': 15117.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7433.953125, 'completions/min_terminated_length': 1108.0, 'completions/max_terminated_length': 15117.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.2590789198875427, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022046178579330444, 'sampling/sampling_logp_difference/max': 15.87417984008789, 'sampling/importance_sampling_ratio/min': 1.2762369294705422e-07, 'sampling/importance_sampling_ratio/mean': 0.9999595880508423, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0665365010499954, 'clip_ratio/low_mean': 4.332422963670979e-05, 'clip_ratio/low_min': 5.504910404852126e-06, 'clip_ratio/high_mean': 1.9224487459723605e-06, 'clip_ratio/high_max': 7.689794983889442e-06, 'clip_ratio/region_mean': 4.5246677473187447e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 444/1024 [19:56:34<27:06:37, 168.27s/it][AINFO 12-02 20:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:10:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 445/1024 [19:59:38<27:49:48, 173.04s/it][A
+                                                         [A{'loss': 0.0225, 'grad_norm': 0.0014674996491521597, 'learning_rate': 1e-05, 'num_tokens': 375706673.0, 'completions/mean_length': 7646.7265625, 'completions/min_length': 1019.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7217.0244140625, 'completions/min_terminated_length': 1019.0, 'completions/max_terminated_length': 15989.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.1820138692855835, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018912145867943764, 'sampling/sampling_logp_difference/max': 12.899565696716309, 'sampling/importance_sampling_ratio/min': 2.4991354621306527e-06, 'sampling/importance_sampling_ratio/mean': 1.0000466108322144, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9308071210980415, 'clip_ratio/low_mean': 2.7811285235657124e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.7811285235657124e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 445/1024 [19:59:38<27:49:48, 173.04s/it][AINFO 12-02 20:13:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:13:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:13:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:13:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▎     | 446/1024 [20:02:10<26:45:43, 166.68s/it][A
+                                                         [A{'loss': 0.0489, 'grad_norm': 0.002320521976798773, 'learning_rate': 1e-05, 'num_tokens': 376506613.0, 'completions/mean_length': 6046.46875, 'completions/min_length': 305.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5965.07080078125, 'completions/min_terminated_length': 305.0, 'completions/max_terminated_length': 15082.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018557455390691757, 'sampling/sampling_logp_difference/max': 10.611783027648926, 'sampling/importance_sampling_ratio/min': 2.462414704496041e-05, 'sampling/importance_sampling_ratio/mean': 0.9999364614486694, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9040833190083504, 'clip_ratio/low_mean': 2.5548037910994026e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.42428562969144e-06, 'clip_ratio/high_max': 2.569714251876576e-05, 'clip_ratio/region_mean': 3.1972323540685466e-05, 'epoch': 0.41}
+
+ 44%|████▎     | 446/1024 [20:02:10<26:45:43, 166.68s/it][AINFO 12-02 20:15:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:15:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:15:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:15:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▎     | 447/1024 [20:04:59<26:50:17, 167.45s/it][A
+                                                         [A{'loss': 0.1085, 'grad_norm': 0.004084885586053133, 'learning_rate': 1e-05, 'num_tokens': 377476249.0, 'completions/mean_length': 7435.53125, 'completions/min_length': 268.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7220.7685546875, 'completions/min_terminated_length': 268.0, 'completions/max_terminated_length': 16110.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02096719481050968, 'sampling/sampling_logp_difference/max': 7.0617780685424805, 'sampling/importance_sampling_ratio/min': 0.0008572525111958385, 'sampling/importance_sampling_ratio/mean': 0.9999234676361084, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0237125977873802, 'clip_ratio/low_mean': 5.543450777167891e-05, 'clip_ratio/low_min': 1.7309802160525578e-05, 'clip_ratio/high_mean': 4.301844171550329e-06, 'clip_ratio/high_max': 1.3460261698128306e-05, 'clip_ratio/region_mean': 5.973635086320428e-05, 'epoch': 0.41}
+
+ 44%|████▎     | 447/1024 [20:04:59<26:50:17, 167.45s/it][AINFO 12-02 20:18:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:18:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:18:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:18:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 448/1024 [20:07:50<26:54:56, 168.22s/it][A
+                                                         [A{'loss': 0.1291, 'grad_norm': 0.0025359690189361572, 'learning_rate': 1e-05, 'num_tokens': 378423338.0, 'completions/mean_length': 7228.1953125, 'completions/min_length': 501.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7008.45654296875, 'completions/min_terminated_length': 501.0, 'completions/max_terminated_length': 16351.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3182457685470581, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020272942259907722, 'sampling/sampling_logp_difference/max': 8.250354766845703, 'sampling/importance_sampling_ratio/min': 0.00026116587105207145, 'sampling/importance_sampling_ratio/mean': 1.0000035762786865, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9323876351118088, 'clip_ratio/low_mean': 3.730497360265872e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.26576740564633e-06, 'clip_ratio/high_max': 2.7470227905723732e-05, 'clip_ratio/region_mean': 4.557074043987086e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 448/1024 [20:07:50<26:54:56, 168.22s/it][AINFO 12-02 20:21:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:21:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:21:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:21:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 44%|████▍     | 449/1024 [20:11:01<28:00:19, 175.34s/it][A
+                                                         [A{'loss': 0.0477, 'grad_norm': 0.0007894439622759819, 'learning_rate': 1e-05, 'num_tokens': 379340811.0, 'completions/mean_length': 6997.0703125, 'completions/min_length': 1150.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6454.0244140625, 'completions/min_terminated_length': 1150.0, 'completions/max_terminated_length': 16317.0, 'rewards/accuracy_reward/mean': 0.6484375, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.6484375, 'reward_std': 0.19674429297447205, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.017771108075976372, 'sampling/sampling_logp_difference/max': 6.068617343902588, 'sampling/importance_sampling_ratio/min': 0.002314371056854725, 'sampling/importance_sampling_ratio/mean': 0.9999365210533142, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7780926302075386, 'clip_ratio/low_mean': 2.8192220440814708e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.8192220440814708e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 449/1024 [20:11:01<28:00:19, 175.34s/it][AINFO 12-02 20:24:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:24:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:24:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:24:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 450/1024 [20:13:55<27:52:00, 174.77s/it][A
+                                                         [A{'loss': 0.0304, 'grad_norm': 0.0022072133142501116, 'learning_rate': 1e-05, 'num_tokens': 380262405.0, 'completions/mean_length': 7062.015625, 'completions/min_length': 608.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6761.30615234375, 'completions/min_terminated_length': 608.0, 'completions/max_terminated_length': 15812.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.1922685205936432, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021276362240314484, 'sampling/sampling_logp_difference/max': 6.679624557495117, 'sampling/importance_sampling_ratio/min': 0.001256249495781958, 'sampling/importance_sampling_ratio/mean': 0.9999186992645264, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.033200815320015, 'clip_ratio/low_mean': 2.8732446423873625e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.369190934194194e-07, 'clip_ratio/high_max': 3.7476763736776775e-06, 'clip_ratio/region_mean': 2.9669366028883815e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 450/1024 [20:13:55<27:52:00, 174.77s/it][AINFO 12-02 20:27:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:27:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:27:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:27:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 451/1024 [20:16:27<26:44:30, 168.01s/it][A
+                                                         [A{'loss': 0.0515, 'grad_norm': 0.0017955084331333637, 'learning_rate': 1e-05, 'num_tokens': 381119447.0, 'completions/mean_length': 6531.390625, 'completions/min_length': 696.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6453.81103515625, 'completions/min_terminated_length': 696.0, 'completions/max_terminated_length': 15221.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.19833700358867645, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021526070311665535, 'sampling/sampling_logp_difference/max': 5.85356330871582, 'sampling/importance_sampling_ratio/min': 0.002869655378162861, 'sampling/importance_sampling_ratio/mean': 1.0001041889190674, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0753802210092545, 'clip_ratio/low_mean': 1.9905703766198712e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0130514144511835e-06, 'clip_ratio/high_max': 8.052205657804734e-06, 'clip_ratio/region_mean': 2.1918755180649896e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 451/1024 [20:16:27<26:44:30, 168.01s/it][AINFO 12-02 20:30:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:30:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:30:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:30:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 452/1024 [20:19:41<27:54:16, 175.62s/it][A
+                                                         [A{'loss': 0.0453, 'grad_norm': 0.002732889261096716, 'learning_rate': 1e-05, 'num_tokens': 382167791.0, 'completions/mean_length': 8017.1875, 'completions/min_length': 981.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7605.70458984375, 'completions/min_terminated_length': 981.0, 'completions/max_terminated_length': 16120.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.26303553581237793, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021047521382570267, 'sampling/sampling_logp_difference/max': 9.811981201171875, 'sampling/importance_sampling_ratio/min': 5.479118772200309e-05, 'sampling/importance_sampling_ratio/mean': 0.9999288320541382, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9964447841048241, 'clip_ratio/low_mean': 2.9879876365157543e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.18260617859778e-06, 'clip_ratio/high_max': 1.273042471439112e-05, 'clip_ratio/region_mean': 3.3062482771129e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 452/1024 [20:19:41<27:54:16, 175.62s/it][AINFO 12-02 20:33:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:33:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:33:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:33:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 453/1024 [20:22:44<28:12:34, 177.85s/it][A
+                                                         [A{'loss': 0.0232, 'grad_norm': 0.0019144028192386031, 'learning_rate': 1e-05, 'num_tokens': 383141191.0, 'completions/mean_length': 7459.4375, 'completions/min_length': 771.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7171.54833984375, 'completions/min_terminated_length': 771.0, 'completions/max_terminated_length': 15906.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.27328529953956604, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021217871457338333, 'sampling/sampling_logp_difference/max': 4.64915132522583, 'sampling/importance_sampling_ratio/min': 0.00956972036510706, 'sampling/importance_sampling_ratio/mean': 0.9999465942382812, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0056956112384796, 'clip_ratio/low_mean': 3.7889756640652195e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2414094499035855e-06, 'clip_ratio/high_max': 4.965637799614342e-06, 'clip_ratio/region_mean': 3.913116597686894e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 453/1024 [20:22:44<28:12:34, 177.85s/it][AINFO 12-02 20:36:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:36:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:36:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:36:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 454/1024 [20:26:05<29:16:56, 184.94s/it][A
+                                                         [A{'loss': 0.0275, 'grad_norm': 0.0016517839394509792, 'learning_rate': 1e-05, 'num_tokens': 384334334.0, 'completions/mean_length': 9159.3671875, 'completions/min_length': 1863.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 8985.9765625, 'completions/min_terminated_length': 1863.0, 'completions/max_terminated_length': 16333.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020924758166074753, 'sampling/sampling_logp_difference/max': 5.025218963623047, 'sampling/importance_sampling_ratio/min': 0.006570147816091776, 'sampling/importance_sampling_ratio/mean': 0.9999613761901855, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9315929487347603, 'clip_ratio/low_mean': 2.2691939818741957e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.362091030747251e-06, 'clip_ratio/high_max': 1.8957232214233954e-05, 'clip_ratio/region_mean': 2.8054030622115533e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 454/1024 [20:26:05<29:16:56, 184.94s/it][AINFO 12-02 20:39:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:39:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:39:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:39:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 455/1024 [20:28:48<28:10:38, 178.27s/it][A
+                                                         [A{'loss': 0.0541, 'grad_norm': 0.0014509644825011492, 'learning_rate': 1e-05, 'num_tokens': 385302184.0, 'completions/mean_length': 7417.953125, 'completions/min_length': 906.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7347.3544921875, 'completions/min_terminated_length': 906.0, 'completions/max_terminated_length': 15498.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021562211215496063, 'sampling/sampling_logp_difference/max': 8.243823051452637, 'sampling/importance_sampling_ratio/min': 0.0002628773218020797, 'sampling/importance_sampling_ratio/mean': 1.000024437904358, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9871305003762245, 'clip_ratio/low_mean': 2.6670725219446467e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3729704114193737e-06, 'clip_ratio/high_max': 9.182131861962262e-06, 'clip_ratio/region_mean': 3.004369523296191e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 455/1024 [20:28:48<28:10:38, 178.27s/it][AINFO 12-02 20:42:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:42:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:42:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:42:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▍     | 456/1024 [20:31:38<27:45:58, 175.98s/it][A
+                                                         [A{'loss': 0.0869, 'grad_norm': 0.0020068485755473375, 'learning_rate': 1e-05, 'num_tokens': 386196134.0, 'completions/mean_length': 6822.484375, 'completions/min_length': 783.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6670.71484375, 'completions/min_terminated_length': 783.0, 'completions/max_terminated_length': 15221.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2914257347583771, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021426022052764893, 'sampling/sampling_logp_difference/max': 10.502419471740723, 'sampling/importance_sampling_ratio/min': 2.7469906854094006e-05, 'sampling/importance_sampling_ratio/mean': 0.9998727440834045, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9912217557430267, 'clip_ratio/low_mean': 3.5371780995774316e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.46860530145932e-07, 'clip_ratio/high_max': 3.387442120583728e-06, 'clip_ratio/region_mean': 3.621864152592025e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 456/1024 [20:31:38<27:45:58, 175.98s/it][AINFO 12-02 20:45:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:45:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:45:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:45:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▍     | 457/1024 [20:35:05<29:08:51, 185.06s/it][A
+                                                         [A{'loss': 0.0892, 'grad_norm': 0.001828011590987444, 'learning_rate': 1e-05, 'num_tokens': 387193305.0, 'completions/mean_length': 7635.6484375, 'completions/min_length': 740.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7425.6884765625, 'completions/min_terminated_length': 740.0, 'completions/max_terminated_length': 16326.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.24541424214839935, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02074388600885868, 'sampling/sampling_logp_difference/max': 7.748712062835693, 'sampling/importance_sampling_ratio/min': 0.00043129766709171236, 'sampling/importance_sampling_ratio/mean': 1.0000317096710205, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9444101229310036, 'clip_ratio/low_mean': 3.6214845977156074e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.284073722577887e-06, 'clip_ratio/high_max': 2.1136294890311547e-05, 'clip_ratio/region_mean': 4.149891879023926e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 457/1024 [20:35:05<29:08:51, 185.06s/it][AINFO 12-02 20:48:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:48:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:48:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:48:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▍     | 458/1024 [20:38:04<28:49:10, 183.30s/it][A
+                                                         [A{'loss': 0.0861, 'grad_norm': 0.0030686266254633665, 'learning_rate': 1e-05, 'num_tokens': 388134582.0, 'completions/mean_length': 7158.8515625, 'completions/min_length': 908.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7012.4208984375, 'completions/min_terminated_length': 908.0, 'completions/max_terminated_length': 16282.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.3634909689426422, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021355487406253815, 'sampling/sampling_logp_difference/max': 6.370809078216553, 'sampling/importance_sampling_ratio/min': 0.0017107746098190546, 'sampling/importance_sampling_ratio/mean': 1.0000251531600952, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9376078173518181, 'clip_ratio/low_mean': 4.793580501427641e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2275543730975187e-06, 'clip_ratio/high_max': 1.2910217492390075e-05, 'clip_ratio/region_mean': 5.116335842103581e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 458/1024 [20:38:04<28:49:10, 183.30s/it][AINFO 12-02 20:51:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:51:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:51:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:51:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▍     | 459/1024 [20:40:57<28:17:09, 180.23s/it][A
+                                                         [A{'loss': 0.0848, 'grad_norm': 0.0013667051680386066, 'learning_rate': 1e-05, 'num_tokens': 389166536.0, 'completions/mean_length': 7911.015625, 'completions/min_length': 989.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7420.8427734375, 'completions/min_terminated_length': 989.0, 'completions/max_terminated_length': 16195.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.22673209011554718, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020827412605285645, 'sampling/sampling_logp_difference/max': 6.468746662139893, 'sampling/importance_sampling_ratio/min': 0.0015511686215177178, 'sampling/importance_sampling_ratio/mean': 1.000077247619629, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9648336246609688, 'clip_ratio/low_mean': 2.8521372371415055e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.276668107650039e-06, 'clip_ratio/high_max': 1.7106672430600156e-05, 'clip_ratio/region_mean': 3.279804064959535e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 459/1024 [20:40:57<28:17:09, 180.23s/it][AINFO 12-02 20:54:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:54:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:54:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:54:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▍     | 460/1024 [20:43:43<27:35:20, 176.10s/it][A
+                                                         [A{'loss': 0.1496, 'grad_norm': 0.0033073534723371267, 'learning_rate': 1e-05, 'num_tokens': 390051215.0, 'completions/mean_length': 6767.3671875, 'completions/min_length': 903.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6536.568359375, 'completions/min_terminated_length': 903.0, 'completions/max_terminated_length': 15887.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3861120939254761, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01937835104763508, 'sampling/sampling_logp_difference/max': 9.37320613861084, 'sampling/importance_sampling_ratio/min': 8.497052476741374e-05, 'sampling/importance_sampling_ratio/mean': 0.999936580657959, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9320580661296844, 'clip_ratio/low_mean': 5.936963270869455e-05, 'clip_ratio/low_min': 3.850893335766159e-06, 'clip_ratio/high_mean': 3.2315410862793215e-06, 'clip_ratio/high_max': 1.009413745123311e-05, 'clip_ratio/region_mean': 6.260117379497387e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 460/1024 [20:43:43<27:35:20, 176.10s/it][AINFO 12-02 20:57:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:57:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:57:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:57:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 461/1024 [20:46:32<27:12:00, 173.93s/it][A
+                                                         [A{'loss': 0.0572, 'grad_norm': 0.0011497536906972528, 'learning_rate': 1e-05, 'num_tokens': 390970312.0, 'completions/mean_length': 7026.1953125, 'completions/min_length': 1252.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6877.6591796875, 'completions/min_terminated_length': 1252.0, 'completions/max_terminated_length': 15817.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.17623990774154663, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0198888648301363, 'sampling/sampling_logp_difference/max': 5.305865287780762, 'sampling/importance_sampling_ratio/min': 0.004962402395904064, 'sampling/importance_sampling_ratio/mean': 1.0000187158584595, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9348007142543793, 'clip_ratio/low_mean': 8.383698173020093e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.565637027553748e-06, 'clip_ratio/high_max': 1.3921667232352775e-05, 'clip_ratio/region_mean': 1.2949335427947517e-05, 'epoch': 0.42}
+
+ 45%|████▌     | 461/1024 [20:46:32<27:12:00, 173.93s/it][AINFO 12-02 21:00:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:00:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:00:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:00:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 462/1024 [20:49:28<27:13:10, 174.36s/it][A
+                                                         [A{'loss': 0.0402, 'grad_norm': 0.0018075309926643968, 'learning_rate': 1e-05, 'num_tokens': 392049047.0, 'completions/mean_length': 8207.4921875, 'completions/min_length': 1306.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7680.39208984375, 'completions/min_terminated_length': 1306.0, 'completions/max_terminated_length': 16227.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.22567126154899597, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.022010331973433495, 'sampling/sampling_logp_difference/max': 10.775917053222656, 'sampling/importance_sampling_ratio/min': 2.0896746718790382e-05, 'sampling/importance_sampling_ratio/mean': 0.9999680519104004, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0377789065241814, 'clip_ratio/low_mean': 1.9853792764479294e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.821525806164573e-06, 'clip_ratio/high_max': 1.6490183043060824e-05, 'clip_ratio/region_mean': 2.4675319082234637e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 462/1024 [20:49:28<27:13:10, 174.36s/it][AINFO 12-02 21:03:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:03:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:03:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:03:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 463/1024 [20:52:07<26:29:11, 169.97s/it][A
+                                                         [A{'loss': 0.1041, 'grad_norm': 0.0026289531961083412, 'learning_rate': 1e-05, 'num_tokens': 392925154.0, 'completions/mean_length': 6708.5234375, 'completions/min_length': 1029.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6315.2109375, 'completions/min_terminated_length': 1029.0, 'completions/max_terminated_length': 16023.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.39530590176582336, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018315661698579788, 'sampling/sampling_logp_difference/max': 9.99405574798584, 'sampling/importance_sampling_ratio/min': 4.567060386762023e-05, 'sampling/importance_sampling_ratio/mean': 0.9999178647994995, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8043844699859619, 'clip_ratio/low_mean': 3.467209478458244e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.289703544051008e-06, 'clip_ratio/high_max': 3.315881417620403e-05, 'clip_ratio/region_mean': 4.296179929497157e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 463/1024 [20:52:07<26:29:11, 169.97s/it][AINFO 12-02 21:05:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:05:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:05:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:05:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 464/1024 [20:54:58<26:27:13, 170.06s/it][A
+                                                         [A{'loss': 0.0917, 'grad_norm': 0.0013463656650856137, 'learning_rate': 1e-05, 'num_tokens': 393802508.0, 'completions/mean_length': 6698.953125, 'completions/min_length': 493.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6138.6611328125, 'completions/min_terminated_length': 493.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021347172558307648, 'sampling/sampling_logp_difference/max': 7.162531852722168, 'sampling/importance_sampling_ratio/min': 0.0007750896620564163, 'sampling/importance_sampling_ratio/mean': 1.000024676322937, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9281814023852348, 'clip_ratio/low_mean': 4.699321112866528e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7556933446248877e-06, 'clip_ratio/high_max': 1.102277337849955e-05, 'clip_ratio/region_mean': 4.9748904586977005e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 464/1024 [20:54:58<26:27:13, 170.06s/it][AINFO 12-02 21:08:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:08:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:08:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:08:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 465/1024 [20:57:35<25:48:05, 166.16s/it][A
+                                                         [A{'loss': 0.0083, 'grad_norm': 0.002612709766253829, 'learning_rate': 1e-05, 'num_tokens': 394658497.0, 'completions/mean_length': 6545.1640625, 'completions/min_length': 978.0, 'completions/max_length': 15739.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6545.1640625, 'completions/min_terminated_length': 978.0, 'completions/max_terminated_length': 15739.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.22962790727615356, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022003278136253357, 'sampling/sampling_logp_difference/max': 7.186861038208008, 'sampling/importance_sampling_ratio/min': 0.0007564599509350955, 'sampling/importance_sampling_ratio/mean': 0.9999566078186035, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9567619636654854, 'clip_ratio/low_mean': 4.3550548753046314e-05, 'clip_ratio/low_min': 2.836203520928393e-06, 'clip_ratio/high_mean': 4.292386165616335e-06, 'clip_ratio/high_max': 1.4214670954970643e-05, 'clip_ratio/region_mean': 4.784293491866265e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 465/1024 [20:57:35<25:48:05, 166.16s/it][AINFO 12-02 21:11:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:11:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:11:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:11:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 466/1024 [21:00:04<24:59:38, 161.25s/it][A
+                                                         [A{'loss': 0.0008, 'grad_norm': 0.0005542136495932937, 'learning_rate': 1e-05, 'num_tokens': 395446972.0, 'completions/mean_length': 6006.0859375, 'completions/min_length': 735.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5757.01611328125, 'completions/min_terminated_length': 735.0, 'completions/max_terminated_length': 14111.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.18884867429733276, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019080644473433495, 'sampling/sampling_logp_difference/max': 8.687469482421875, 'sampling/importance_sampling_ratio/min': 0.00016868635430000722, 'sampling/importance_sampling_ratio/mean': 0.9998357892036438, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9361230358481407, 'clip_ratio/low_mean': 1.425048890268954e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5081318451848347e-06, 'clip_ratio/high_max': 6.032527380739339e-06, 'clip_ratio/region_mean': 1.57586205205007e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 466/1024 [21:00:04<24:59:38, 161.25s/it][AINFO 12-02 21:13:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:13:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:13:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:13:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 467/1024 [21:03:23<26:41:17, 172.49s/it][A
+                                                         [A{'loss': 0.0417, 'grad_norm': 0.0009884496685117483, 'learning_rate': 1e-05, 'num_tokens': 396567557.0, 'completions/mean_length': 8591.0078125, 'completions/min_length': 1581.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8274.21875, 'completions/min_terminated_length': 1581.0, 'completions/max_terminated_length': 15793.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.20753079652786255, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.022461744025349617, 'sampling/sampling_logp_difference/max': 4.075876712799072, 'sampling/importance_sampling_ratio/min': 0.016977323219180107, 'sampling/importance_sampling_ratio/mean': 0.9999769926071167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0343038365244865, 'clip_ratio/low_mean': 3.96561113120697e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7544857087159471e-06, 'clip_ratio/high_max': 7.0179428348637884e-06, 'clip_ratio/region_mean': 4.1410597532376414e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 467/1024 [21:03:23<26:41:17, 172.49s/it][AINFO 12-02 21:16:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:16:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:16:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:16:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 468/1024 [21:06:22<26:54:47, 174.26s/it][A
+                                                         [A{'loss': 0.0727, 'grad_norm': 0.0031053642742335796, 'learning_rate': 1e-05, 'num_tokens': 397564362.0, 'completions/mean_length': 7650.1640625, 'completions/min_length': 1292.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7295.1298828125, 'completions/min_terminated_length': 1292.0, 'completions/max_terminated_length': 15014.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021740511059761047, 'sampling/sampling_logp_difference/max': 5.934356212615967, 'sampling/importance_sampling_ratio/min': 0.0026469260919839144, 'sampling/importance_sampling_ratio/mean': 1.0000197887420654, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9868866801261902, 'clip_ratio/low_mean': 4.859435853177274e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.267530360062665e-06, 'clip_ratio/high_max': 9.630196018406423e-06, 'clip_ratio/region_mean': 5.186188877814857e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 468/1024 [21:06:22<26:54:47, 174.26s/it][AINFO 12-02 21:19:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:19:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:19:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:19:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 469/1024 [21:09:11<26:38:00, 172.76s/it][A
+                                                         [A{'loss': 0.1011, 'grad_norm': 0.002716085175052285, 'learning_rate': 1e-05, 'num_tokens': 398516179.0, 'completions/mean_length': 7266.4453125, 'completions/min_length': 489.0, 'completions/max_length': 15951.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7266.4453125, 'completions/min_terminated_length': 489.0, 'completions/max_terminated_length': 15951.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.34139877557754517, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020849108695983887, 'sampling/sampling_logp_difference/max': 14.288839340209961, 'sampling/importance_sampling_ratio/min': 6.229252562661713e-07, 'sampling/importance_sampling_ratio/mean': 0.9999359846115112, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9516391009092331, 'clip_ratio/low_mean': 5.676441310242808e-05, 'clip_ratio/low_min': 8.211187832785072e-06, 'clip_ratio/high_mean': 1.6887177594071545e-06, 'clip_ratio/high_max': 6.754871037628618e-06, 'clip_ratio/region_mean': 5.8453130804991815e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 469/1024 [21:09:11<26:38:00, 172.76s/it][AINFO 12-02 21:22:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:22:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:22:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:22:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 470/1024 [21:11:47<25:50:20, 167.91s/it][A
+                                                         [A{'loss': 0.0718, 'grad_norm': 0.0028007710352540016, 'learning_rate': 1e-05, 'num_tokens': 399270487.0, 'completions/mean_length': 5749.46875, 'completions/min_length': 724.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5494.240234375, 'completions/min_terminated_length': 724.0, 'completions/max_terminated_length': 16233.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.25118330121040344, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02107393927872181, 'sampling/sampling_logp_difference/max': 6.749991416931152, 'sampling/importance_sampling_ratio/min': 0.0011708897072821856, 'sampling/importance_sampling_ratio/mean': 1.0000059604644775, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0874007269740105, 'clip_ratio/low_mean': 2.9946442737127654e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.9946442737127654e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 470/1024 [21:11:47<25:50:20, 167.91s/it][AINFO 12-02 21:25:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:25:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:25:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:25:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 471/1024 [21:14:37<25:52:20, 168.43s/it][A
+                                                         [A{'loss': 0.0425, 'grad_norm': 0.001793393399566412, 'learning_rate': 1e-05, 'num_tokens': 400268505.0, 'completions/mean_length': 7647.703125, 'completions/min_length': 912.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7438.0322265625, 'completions/min_terminated_length': 912.0, 'completions/max_terminated_length': 16338.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.26303553581237793, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02151917666196823, 'sampling/sampling_logp_difference/max': 6.313166618347168, 'sampling/importance_sampling_ratio/min': 0.0018122853944078088, 'sampling/importance_sampling_ratio/mean': 0.999920129776001, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9148579463362694, 'clip_ratio/low_mean': 5.202002398618788e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.784752481005853e-07, 'clip_ratio/high_max': 3.5139009924023412e-06, 'clip_ratio/region_mean': 5.2898499234288465e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 471/1024 [21:14:37<25:52:20, 168.43s/it][AINFO 12-02 21:28:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:28:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:28:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:28:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 472/1024 [21:17:26<25:51:54, 168.69s/it][A
+                                                         [A{'loss': 0.028, 'grad_norm': 0.0013443480711430311, 'learning_rate': 1e-05, 'num_tokens': 401147232.0, 'completions/mean_length': 6693.4296875, 'completions/min_length': 298.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6460.8564453125, 'completions/min_terminated_length': 298.0, 'completions/max_terminated_length': 16073.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019427888095378876, 'sampling/sampling_logp_difference/max': 12.951128959655762, 'sampling/importance_sampling_ratio/min': 2.3735378817946184e-06, 'sampling/importance_sampling_ratio/mean': 0.999970555305481, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9448740482330322, 'clip_ratio/low_mean': 3.2884086522244615e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.2884086522244615e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 472/1024 [21:17:26<25:51:54, 168.69s/it][AINFO 12-02 21:30:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:30:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:30:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:30:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 473/1024 [21:20:37<26:48:57, 175.20s/it][A
+                                                         [A{'loss': 0.0483, 'grad_norm': 0.002113877795636654, 'learning_rate': 1e-05, 'num_tokens': 402163278.0, 'completions/mean_length': 7770.421875, 'completions/min_length': 926.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7040.45751953125, 'completions/min_terminated_length': 926.0, 'completions/max_terminated_length': 16257.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.28171277046203613, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01941337063908577, 'sampling/sampling_logp_difference/max': 8.426445007324219, 'sampling/importance_sampling_ratio/min': 0.00021899864077568054, 'sampling/importance_sampling_ratio/mean': 0.9999215602874756, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8205523490905762, 'clip_ratio/low_mean': 4.75153649404092e-05, 'clip_ratio/low_min': 3.133931613774621e-06, 'clip_ratio/high_mean': 7.69495954955346e-07, 'clip_ratio/high_max': 3.077983819821384e-06, 'clip_ratio/region_mean': 4.828486112273822e-05, 'epoch': 0.44}
+
+ 46%|████▌     | 473/1024 [21:20:37<26:48:57, 175.20s/it][AINFO 12-02 21:34:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:34:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:34:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:34:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▋     | 474/1024 [21:23:44<27:18:23, 178.73s/it][A
+                                                         [A{'loss': 0.0336, 'grad_norm': 0.0018291162559762597, 'learning_rate': 1e-05, 'num_tokens': 403312759.0, 'completions/mean_length': 8825.5703125, 'completions/min_length': 1298.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 8253.9248046875, 'completions/min_terminated_length': 1298.0, 'completions/max_terminated_length': 16339.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022145045921206474, 'sampling/sampling_logp_difference/max': 11.262812614440918, 'sampling/importance_sampling_ratio/min': 1.2841703210142441e-05, 'sampling/importance_sampling_ratio/mean': 1.0, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0104900375008583, 'clip_ratio/low_mean': 3.358103390382894e-05, 'clip_ratio/low_min': 4.408238055475522e-06, 'clip_ratio/high_mean': 2.8587630822585197e-06, 'clip_ratio/high_max': 1.1435052329034079e-05, 'clip_ratio/region_mean': 3.6439797213461134e-05, 'epoch': 0.44}
+
+ 46%|████▋     | 474/1024 [21:23:44<27:18:23, 178.73s/it][AINFO 12-02 21:37:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:37:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:37:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:37:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▋     | 475/1024 [21:26:23<26:21:20, 172.82s/it][A
+                                                         [A{'loss': 0.0222, 'grad_norm': 0.0025468948297202587, 'learning_rate': 1e-05, 'num_tokens': 404216281.0, 'completions/mean_length': 6909.328125, 'completions/min_length': 1033.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6758.93701171875, 'completions/min_terminated_length': 1033.0, 'completions/max_terminated_length': 15219.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.20805485546588898, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.022742662578821182, 'sampling/sampling_logp_difference/max': 13.623628616333008, 'sampling/importance_sampling_ratio/min': 1.2115274330426473e-06, 'sampling/importance_sampling_ratio/mean': 1.0000258684158325, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1106521785259247, 'clip_ratio/low_mean': 2.1927354509898578e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.8177594180742744e-06, 'clip_ratio/high_max': 1.9271037672297098e-05, 'clip_ratio/region_mean': 2.67451134732255e-05, 'epoch': 0.44}
+
+ 46%|████▋     | 475/1024 [21:26:23<26:21:20, 172.82s/it][AINFO 12-02 21:39:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:39:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:39:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:39:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▋     | 476/1024 [21:29:11<26:04:49, 171.33s/it][A
+                                                         [A{'loss': 0.0308, 'grad_norm': 0.0014530897606164217, 'learning_rate': 1e-05, 'num_tokens': 405169374.0, 'completions/mean_length': 7287.1015625, 'completions/min_length': 491.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6917.30859375, 'completions/min_terminated_length': 491.0, 'completions/max_terminated_length': 15951.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.21436560153961182, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02199842408299446, 'sampling/sampling_logp_difference/max': 4.618121147155762, 'sampling/importance_sampling_ratio/min': 0.009871325455605984, 'sampling/importance_sampling_ratio/mean': 0.9999035596847534, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0775608494877815, 'clip_ratio/low_mean': 2.58185752954887e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0197085177642293e-06, 'clip_ratio/high_max': 4.1670832615636755e-06, 'clip_ratio/region_mean': 2.7838283585879253e-05, 'epoch': 0.44}
+
+ 46%|████▋     | 476/1024 [21:29:11<26:04:49, 171.33s/it][AINFO 12-02 21:42:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:42:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:42:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:42:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 477/1024 [21:31:41<25:04:45, 165.06s/it][A
+                                                         [A{'loss': 0.1186, 'grad_norm': 0.002122317673638463, 'learning_rate': 1e-05, 'num_tokens': 405971889.0, 'completions/mean_length': 6115.7109375, 'completions/min_length': 1178.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6034.8583984375, 'completions/min_terminated_length': 1178.0, 'completions/max_terminated_length': 15937.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019933655858039856, 'sampling/sampling_logp_difference/max': 7.241690635681152, 'sampling/importance_sampling_ratio/min': 0.0007161000976338983, 'sampling/importance_sampling_ratio/mean': 0.9999507665634155, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.96979720890522, 'clip_ratio/low_mean': 4.640194219973637e-05, 'clip_ratio/low_min': 4.490910214371979e-06, 'clip_ratio/high_mean': 5.712629331355856e-06, 'clip_ratio/high_max': 1.9925172182411188e-05, 'clip_ratio/region_mean': 5.211457164477906e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 477/1024 [21:31:41<25:04:45, 165.06s/it][AINFO 12-02 21:45:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:45:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:45:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:45:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 478/1024 [21:34:45<25:53:49, 170.75s/it][A
+                                                         [A{'loss': 0.0421, 'grad_norm': 0.0024198584724217653, 'learning_rate': 1e-05, 'num_tokens': 406917473.0, 'completions/mean_length': 7231.875, 'completions/min_length': 1301.0, 'completions/max_length': 16012.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7231.875, 'completions/min_terminated_length': 1301.0, 'completions/max_terminated_length': 16012.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.19568344950675964, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020445439964532852, 'sampling/sampling_logp_difference/max': 8.854079246520996, 'sampling/importance_sampling_ratio/min': 0.0001427980314474553, 'sampling/importance_sampling_ratio/mean': 0.9999538064002991, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9227506220340729, 'clip_ratio/low_mean': 3.373004710738314e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.996677610804909e-06, 'clip_ratio/high_max': 7.986710443219636e-06, 'clip_ratio/region_mean': 3.572672494556173e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 478/1024 [21:34:45<25:53:49, 170.75s/it][AINFO 12-02 21:48:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:48:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:48:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:48:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 479/1024 [21:37:44<26:12:48, 173.15s/it][A
+                                                         [A{'loss': 0.0003, 'grad_norm': 0.0015369568718597293, 'learning_rate': 1e-05, 'num_tokens': 407853501.0, 'completions/mean_length': 7169.71875, 'completions/min_length': 575.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6636.66064453125, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 15946.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2012200653553009, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02142396941781044, 'sampling/sampling_logp_difference/max': 6.187495231628418, 'sampling/importance_sampling_ratio/min': 0.002054967451840639, 'sampling/importance_sampling_ratio/mean': 0.9999596476554871, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9511305093765259, 'clip_ratio/low_mean': 2.1491193592737545e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5839830186669133e-06, 'clip_ratio/high_max': 1.0335932074667653e-05, 'clip_ratio/region_mean': 2.4075176838778134e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 479/1024 [21:37:44<26:12:48, 173.15s/it][AINFO 12-02 21:51:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:51:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:51:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:51:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 480/1024 [21:41:07<27:32:09, 182.22s/it][A
+                                                         [A{'loss': 0.0362, 'grad_norm': 0.0009918162832036614, 'learning_rate': 1e-05, 'num_tokens': 408865482.0, 'completions/mean_length': 7752.1640625, 'completions/min_length': 113.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7252.80126953125, 'completions/min_terminated_length': 113.0, 'completions/max_terminated_length': 16011.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.19438526034355164, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020829707384109497, 'sampling/sampling_logp_difference/max': 9.874916076660156, 'sampling/importance_sampling_ratio/min': 5.144917668076232e-05, 'sampling/importance_sampling_ratio/mean': 0.9999382495880127, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9492243379354477, 'clip_ratio/low_mean': 4.3063334089765704e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3296554988737626e-06, 'clip_ratio/high_max': 1.331862199549505e-05, 'clip_ratio/region_mean': 4.6392989588639466e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 480/1024 [21:41:07<27:32:09, 182.22s/it][AINFO 12-02 21:54:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:54:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:54:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:54:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 481/1024 [21:44:10<27:29:51, 182.31s/it][A
+                                                         [A{'loss': 0.0639, 'grad_norm': 0.002358444035053253, 'learning_rate': 1e-05, 'num_tokens': 409829685.0, 'completions/mean_length': 7391.2734375, 'completions/min_length': 1241.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7025.71533203125, 'completions/min_terminated_length': 1241.0, 'completions/max_terminated_length': 16078.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020990099757909775, 'sampling/sampling_logp_difference/max': 5.337113380432129, 'sampling/importance_sampling_ratio/min': 0.004809734411537647, 'sampling/importance_sampling_ratio/mean': 0.9999746084213257, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.949093371629715, 'clip_ratio/low_mean': 1.931758384898785e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.931758384898785e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 481/1024 [21:44:10<27:29:51, 182.31s/it][AINFO 12-02 21:57:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:57:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:57:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:57:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 482/1024 [21:47:12<27:26:21, 182.25s/it][A
+                                                         [A{'loss': 0.097, 'grad_norm': 0.002431367291137576, 'learning_rate': 1e-05, 'num_tokens': 410874854.0, 'completions/mean_length': 8021.2578125, 'completions/min_length': 494.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7820.55224609375, 'completions/min_terminated_length': 494.0, 'completions/max_terminated_length': 16319.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3492894768714905, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019994009286165237, 'sampling/sampling_logp_difference/max': 9.124897003173828, 'sampling/importance_sampling_ratio/min': 0.00010891998681472614, 'sampling/importance_sampling_ratio/mean': 0.9999185800552368, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8854518681764603, 'clip_ratio/low_mean': 4.140929331697407e-05, 'clip_ratio/low_min': 6.721333647874417e-06, 'clip_ratio/high_mean': 5.647267016684054e-06, 'clip_ratio/high_max': 2.2589068066736218e-05, 'clip_ratio/region_mean': 4.705656021997129e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 482/1024 [21:47:12<27:26:21, 182.25s/it][AINFO 12-02 22:00:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:00:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:00:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:00:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 483/1024 [21:50:07<27:05:23, 180.27s/it][A
+                                                         [A{'loss': 0.0429, 'grad_norm': 0.0013435191940516233, 'learning_rate': 1e-05, 'num_tokens': 411817243.0, 'completions/mean_length': 7209.7265625, 'completions/min_length': 728.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7137.48828125, 'completions/min_terminated_length': 728.0, 'completions/max_terminated_length': 16376.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.23592591285705566, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019708827137947083, 'sampling/sampling_logp_difference/max': 9.027420043945312, 'sampling/importance_sampling_ratio/min': 0.00012007186887785792, 'sampling/importance_sampling_ratio/mean': 0.9999951124191284, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8724547103047371, 'clip_ratio/low_mean': 1.7103259438044915e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2817357628591708e-06, 'clip_ratio/high_max': 5.126943051436683e-06, 'clip_ratio/region_mean': 1.8384995200904086e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 483/1024 [21:50:07<27:05:23, 180.27s/it][AINFO 12-02 22:03:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:03:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:03:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:03:40 [block_pool.py:292] Successfully reset prefix cache
diff --git a/dapo_lorafa_20251202_173337/README.md b/dapo_lorafa_20251202_173337/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d7d436561adcb34f3fb79315361c1356c075bd8e
--- /dev/null
+++ b/dapo_lorafa_20251202_173337/README.md
@@ -0,0 +1,68 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: transformers
+model_name: dapo_lorafa_20251202_173337
+tags:
+- generated_from_trainer
+- trl
+- grpo
+licence: license
+---
+
+# Model Card for dapo_lorafa_20251202_173337
+
+This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="dapo_lorafa_20251202_173337", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/mikastars-zhejiang-university/Tina/runs/8lt7zamw) 
+
+
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+
+### Framework versions
+
+- TRL: 0.25.0
+- Transformers: 4.57.1
+- Pytorch: 2.8.0
+- Datasets: 4.4.1
+- Tokenizers: 0.22.1
+
+## Citations
+
+Cite GRPO as:
+
+```bibtex
+@article{shao2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+
+```
+
+Cite TRL as:
+    
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
\ No newline at end of file
diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/README.md b/dapo_lorafa_20251202_173337/checkpoint-576/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_lorafa_20251202_173337/checkpoint-576/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for dapo_lorafa_20251202_173337 (checkpoint-576)
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/adapter_config.json b/dapo_lorafa_20251202_173337/checkpoint-576/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c074229bb0545de98cc4b88111a8b54705fd6f30
--- /dev/null
+++ b/dapo_lorafa_20251202_173337/checkpoint-576/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "down_proj",
+    "v_proj",
+    "gate_proj",
+    "up_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/latest b/dapo_lorafa_20251202_173337/checkpoint-576/latest
new file mode 100644
index 0000000000000000000000000000000000000000..1a40031386820b60f3a54acbdbae4813e4a986c7
--- /dev/null
+++ b/dapo_lorafa_20251202_173337/checkpoint-576/latest
@@ -0,0 +1 @@
+global_step576
\ No newline at end of file
diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/tokenizer_config.json b/dapo_lorafa_20251202_173337/checkpoint-576/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_lorafa_20251202_173337/checkpoint-576/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_lorafa_20251202_173337/output.log b/dapo_lorafa_20251202_173337/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..114d01ebb27c49dc1857efbb4b639c968fd9b349
--- /dev/null
+++ b/dapo_lorafa_20251202_173337/output.log
@@ -0,0 +1,3293 @@
+W1202 17:34:02.231000 1217291 torch/distributed/run.py:774] 
+W1202 17:34:02.231000 1217291 torch/distributed/run.py:774] *****************************************
+W1202 17:34:02.231000 1217291 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W1202 17:34:02.231000 1217291 torch/distributed/run.py:774] *****************************************
+INFO 12-02 17:34:24 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-02 17:34:24 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-02 17:34:24 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-02 17:34:24 [__init__.py:216] Automatically detected platform cuda.
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lorafa', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lorafa_20251202_173337', run_name='outputs/dapo_lorafa_20251202_173337', resume_from_checkpoint='outputs/train/dapo_lorafa_20251201_161746/checkpoint-512', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+[OpenTinker] 2025-12-02 17:34:30,825 - root - INFO - Output directory outputs/dapo_lorafa_20251202_173337 already exists, using it
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lorafa', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lorafa_20251202_173337', run_name='outputs/dapo_lorafa_20251202_173337', resume_from_checkpoint='outputs/train/dapo_lorafa_20251201_161746/checkpoint-512', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+[OpenTinker] 2025-12-02 17:34:30,827 - root - INFO - Output directory outputs/dapo_lorafa_20251202_173337 already exists, using it
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lorafa', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lorafa_20251202_173337', run_name='outputs/dapo_lorafa_20251202_173337', resume_from_checkpoint='outputs/train/dapo_lorafa_20251201_161746/checkpoint-512', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+[OpenTinker] 2025-12-02 17:34:30,831 - root - INFO - Output directory outputs/dapo_lorafa_20251202_173337 already exists, using it
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lorafa', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lorafa_20251202_173337', run_name='outputs/dapo_lorafa_20251202_173337', resume_from_checkpoint='outputs/train/dapo_lorafa_20251201_161746/checkpoint-512', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+[OpenTinker] 2025-12-02 17:34:30,836 - root - INFO - Output directory outputs/dapo_lorafa_20251202_173337 already exists, using it
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: setting up run 8lt7zamw
+wandb: setting up run t4d1xrj2
+wandb: setting up run hpd46kjr
+wandb: setting up run qkisy38r
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_173434-8lt7zamw
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_lorafa_20251202_173337
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/8lt7zamw
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_173434-qkisy38r
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_lorafa_20251202_173337
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/qkisy38r
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_173434-t4d1xrj2
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_lorafa_20251202_173337
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/t4d1xrj2
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_173434-hpd46kjr
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_lorafa_20251202_173337
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/hpd46kjr
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 17:34:36,641 - root - INFO - Wandb initialized successfully
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 17:34:36,641 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+[OpenTinker] 2025-12-02 17:34:36,641 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-02 17:34:36,641 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 17:34:36,641 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-02 17:34:36,642 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 17:34:36,765 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-02 17:34:36,766 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-02 17:34:37,986 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 17:34:38,089 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 17:34:39,180 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 17:34:39,449 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 17:34:41,016 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-02 17:34:41,112 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+[OpenTinker] 2025-12-02 17:34:42,227 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-02 17:34:42,438 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-02 17:34:42,554 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 17:34:42,554 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 17:34:42,555 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 17:34:42,555 - root - INFO - Detected PEFT configuration, configuring lora
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+[OpenTinker] 2025-12-02 17:34:43,055 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 17:34:43,055 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 17:34:43,554 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp3lmny6fn/test.c -o /tmp/tmp3lmny6fn/test.o
+[OpenTinker] 2025-12-02 17:34:43,562 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 17:34:43,568 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 17:34:43,568 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp1e_h4zes/test.c -o /tmp/tmp1e_h4zes/test.o
+[OpenTinker] 2025-12-02 17:34:43,592 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 17:34:43,597 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 17:34:43,618 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp3lmny6fn/test.o -laio -o /tmp/tmp3lmny6fn/a.out
+[OpenTinker] 2025-12-02 17:34:43,632 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp1e_h4zes/test.o -laio -o /tmp/tmp1e_h4zes/a.out
+[OpenTinker] 2025-12-02 17:34:43,943 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 17:34:43,976 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 17:34:44,097 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp0gl4wseb/test.c -o /tmp/tmp0gl4wseb/test.o
+[OpenTinker] 2025-12-02 17:34:44,110 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpbe2hoe5y/test.c -o /tmp/tmpbe2hoe5y/test.o
+[OpenTinker] 2025-12-02 17:34:44,127 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp0gl4wseb/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp0gl4wseb/a.out
+[OpenTinker] 2025-12-02 17:34:44,144 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpbe2hoe5y/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpbe2hoe5y/a.out
+[OpenTinker] 2025-12-02 17:34:44,259 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpj4z5ww_7/test.c -o /tmp/tmpj4z5ww_7/test.o
+[OpenTinker] 2025-12-02 17:34:44,282 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpm82vly95/test.c -o /tmp/tmpm82vly95/test.o
+[OpenTinker] 2025-12-02 17:34:44,306 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpj4z5ww_7/test.o -laio -o /tmp/tmpj4z5ww_7/a.out
+[OpenTinker] 2025-12-02 17:34:44,323 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpm82vly95/test.o -laio -o /tmp/tmpm82vly95/a.out
+[OpenTinker] 2025-12-02 17:34:44,750 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp0uudj5q4/test.c -o /tmp/tmp0uudj5q4/test.o
+[OpenTinker] 2025-12-02 17:34:44,775 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp0uudj5q4/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp0uudj5q4/a.out
+[OpenTinker] 2025-12-02 17:34:44,792 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpojuli1hs/test.c -o /tmp/tmpojuli1hs/test.o
+[OpenTinker] 2025-12-02 17:34:44,817 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpojuli1hs/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpojuli1hs/a.out
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO cudaDriverVersion 12090
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO cudaDriverVersion 12090
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO cudaDriverVersion 12090
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO cudaDriverVersion 12090
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Initialized NET plugin Socket
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Initialized NET plugin Socket
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Initialized NET plugin Socket
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Initialized NET plugin Socket
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO ncclCommInitRankConfig comm 0x1ce20510 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 commId 0x5b31249cba627096 - Init START
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO ncclCommInitRankConfig comm 0x1f12a6e0 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 commId 0x5b31249cba627096 - Init START
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO ncclCommInitRankConfig comm 0x1ed808d0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 commId 0x5b31249cba627096 - Init START
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO ncclCommInitRankConfig comm 0x1dced410 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 commId 0x5b31249cba627096 - Init START
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Bootstrap timings total 0.001767 (create 0.000018, send 0.000086, recv 0.000183, ring 0.000072, delay 0.000000)
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Bootstrap timings total 0.001716 (create 0.000022, send 0.000094, recv 0.000175, ring 0.001058, delay 0.000000)
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Bootstrap timings total 0.026047 (create 0.000019, send 0.000085, recv 0.025513, ring 0.000075, delay 0.000001)
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Bootstrap timings total 0.002694 (create 0.000020, send 0.000107, recv 0.000116, ring 0.001088, delay 0.000001)
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO comm 0x1f12a6e0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO comm 0x1ce20510 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO comm 0x1dced410 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO comm 0x1ed808d0 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-pjul-8:1217539:1218641 [2] NCCL INFO [Proxy Service] Device 2 CPU core 162
+lshn-qs-pjul-8:1217539:1218642 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 175
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-pjul-8:1217540:1218643 [3] NCCL INFO [Proxy Service] Device 3 CPU core 167
+lshn-qs-pjul-8:1217540:1218644 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 81
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-pjul-8:1217538:1218645 [1] NCCL INFO [Proxy Service] Device 1 CPU core 68
+lshn-qs-pjul-8:1217538:1218646 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 165
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-pjul-8:1217537:1218647 [0] NCCL INFO [Proxy Service] Device 0 CPU core 50
+lshn-qs-pjul-8:1217537:1218648 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 151
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO ncclCommInitRankConfig comm 0x1f12a6e0 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 commId 0x5b31249cba627096 - Init COMPLETE
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.97 (kernels 0.21, alloc 0.58, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.06, rest 0.06)
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO ncclCommInitRankConfig comm 0x1dced410 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 commId 0x5b31249cba627096 - Init COMPLETE
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.97 (kernels 0.21, alloc 0.58, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.07, rest 0.06)
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO ncclCommInitRankConfig comm 0x1ed808d0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 commId 0x5b31249cba627096 - Init COMPLETE
+lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.97 (kernels 0.22, alloc 0.58, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.07, rest 0.06)
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO ncclCommInitRankConfig comm 0x1ce20510 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 commId 0x5b31249cba627096 - Init COMPLETE
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 1.00 (kernels 0.21, alloc 0.58, bootstrap 0.03, allgathers 0.01, topo 0.02, graphs 0.02, connections 0.06, rest 0.06)
+[OpenTinker] 2025-12-02 17:34:48,206 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-02 17:34:48,207 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-02 17:34:48,293 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-02 17:34:48,293 - root - INFO - Training model with GRPO
+INFO 12-02 17:34:48 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-02 17:34:48 [utils.py:328] non-default args: {'seed': 1, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-02 17:34:48 [utils.py:328] non-default args: {'seed': 3, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-02 17:34:48 [utils.py:328] non-default args: {'seed': 2, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-02 17:35:06 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 17:35:06 [__init__.py:1815] Using max model len 16896
+INFO 12-02 17:35:06 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 17:35:06 [__init__.py:1815] Using max model len 16896
+INFO 12-02 17:35:06 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 17:35:06 [__init__.py:1815] Using max model len 16896
+INFO 12-02 17:35:06 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 17:35:06 [__init__.py:1815] Using max model len 16896
+INFO 12-02 17:35:07 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 17:35:07 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 17:35:07 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 17:35:07 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 17:35:08 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 17:35:08 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 17:35:08 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 17:35:08 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 17:35:09 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=3, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 12-02 17:35:09 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 12-02 17:35:09 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 12-02 17:35:09 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+[rank3]:[W1202 17:35:11.819826985 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1
+[rank1]:[W1202 17:35:11.853803987 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1
+[rank0]:[W1202 17:35:11.898406648 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1
+[rank2]:[W1202 17:35:11.026588609 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO ncclCommSplit comm 0x1fe18920 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 1 color 2003953581 key 1- Init START
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO ncclCommSplit comm 0x201b9280 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 1 color 2003953581 key 3- Init START
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO ncclCommSplit comm 0x1ed7b660 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 1 color 2003953581 key 2- Init START
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO ncclCommSplit comm 0x1e551b60 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 1 color 2003953581 key 0- Init START
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO comm 0x201b9280 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO comm 0x1ed7b660 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO comm 0x1e551b60 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO comm 0x1fe18920 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217540:1218788 [3] NCCL INFO [Proxy Service] Device 3 CPU core 72
+lshn-qs-pjul-8:1217540:1218789 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 76
+lshn-qs-pjul-8:1217539:1218790 [2] NCCL INFO [Proxy Service] Device 2 CPU core 80
+lshn-qs-pjul-8:1217539:1218791 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 181
+lshn-qs-pjul-8:1217538:1218792 [1] NCCL INFO [Proxy Service] Device 1 CPU core 87
+lshn-qs-pjul-8:1217538:1218793 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 189
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-pjul-8:1217537:1218794 [0] NCCL INFO [Proxy Service] Device 0 CPU core 190
+lshn-qs-pjul-8:1217537:1218795 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 155
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO ncclCommSplit comm 0x201b9280 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 1 color 2003953581 key 3 - Init COMPLETE
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO ncclCommSplit comm 0x1fe18920 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 1 color 2003953581 key 1 - Init COMPLETE
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO ncclCommSplit comm 0x1ed7b660 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 1 color 2003953581 key 2 - Init COMPLETE
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO ncclCommSplit comm 0x1e551b60 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 1 color 2003953581 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.29 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.02, rest 0.22)
+lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.26 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.02, rest 0.18)
+lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.02, rest 0.01)
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.21 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.02, connections 0.02, rest 0.14)
+[Gloo] Rank 0 is connected to 3[Gloo] Rank [Gloo] Rank  peer ranks. Expected number of connected peer ranks is : 1 is connected to [Gloo] Rank 32 is connected to 33 peer ranks.  is connected to 3 peer ranks. 
+Expected number of connected peer ranks is : 33 peer ranks. Expected number of connected peer ranks is : 3
+Expected number of connected peer ranks is : 3
+
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO ncclCommSplit comm 0x1e665b00 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 2 color 59908776 key 0- Init START
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO comm 0x1e665b00 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217537:1218818 [0] NCCL INFO [Proxy Service] Device 0 CPU core 67
+lshn-qs-pjul-8:1217537:1218819 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 68
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO ncclCommSplit comm 0x1e665b00 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 2 color 59908776 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO ncclCommSplit comm 0x1ff2b010 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 4 color 440515407 key 0- Init START
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO comm 0x1ff2b010 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217538:1218833 [1] NCCL INFO [Proxy Service] Device 1 CPU core 63
+lshn-qs-pjul-8:1217538:1218834 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 73
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO ncclCommSplit comm 0x1ff2b010 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 4 color 440515407 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO ncclCommSplit comm 0x1ee8fe50 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 6 color 1227022723 key 0- Init START
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO comm 0x1ee8fe50 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217539:1218848 [2] NCCL INFO [Proxy Service] Device 2 CPU core 153
+lshn-qs-pjul-8:1217539:1218849 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 156
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO ncclCommSplit comm 0x1ee8fe50 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 6 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO ncclCommSplit comm 0x202cdd80 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 8 color 1301067556 key 0- Init START
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO comm 0x202cdd80 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217540:1218865 [3] NCCL INFO [Proxy Service] Device 3 CPU core 172
+lshn-qs-pjul-8:1217540:1218866 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 181
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO ncclCommSplit comm 0x202cdd80 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 8 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO ncclCommSplit comm 0x1fdc0650 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 9 color 59908776 key 0- Init START
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO comm 0x1fdc0650 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217537:1218874 [0] NCCL INFO [Proxy Service] Device 0 CPU core 147
+lshn-qs-pjul-8:1217537:1218875 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 52
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO ncclCommSplit comm 0x1fdc0650 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 9 color 59908776 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.06)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO ncclCommSplit comm 0x216803c0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 11 color 440515407 key 0- Init START
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO comm 0x216803c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217538:1218889 [1] NCCL INFO [Proxy Service] Device 1 CPU core 65
+lshn-qs-pjul-8:1217538:1218890 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 62
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO ncclCommSplit comm 0x216803c0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 11 color 440515407 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO ncclCommSplit comm 0x205ea2f0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 13 color 1227022723 key 0- Init START
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO comm 0x205ea2f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217539:1218904 [2] NCCL INFO [Proxy Service] Device 2 CPU core 188
+lshn-qs-pjul-8:1217539:1218905 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 94
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO ncclCommSplit comm 0x205ea2f0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 13 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO ncclCommSplit comm 0x21a257b0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 15 color 1301067556 key 0- Init START
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO comm 0x21a257b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217540:1218921 [3] NCCL INFO [Proxy Service] Device 3 CPU core 67
+lshn-qs-pjul-8:1217540:1218922 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 164
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO ncclCommSplit comm 0x21a257b0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 15 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO ncclCommSplit comm 0x1fec8260 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 16 color 59908776 key 0- Init START
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO comm 0x1fec8260 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217537:1218930 [0] NCCL INFO [Proxy Service] Device 0 CPU core 150
+lshn-qs-pjul-8:1217537:1218931 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 171
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO ncclCommSplit comm 0x1fec8260 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 16 color 59908776 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO ncclCommSplit comm 0x21787fd0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 18 color 440515407 key 0- Init START
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO comm 0x21787fd0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217538:1218945 [1] NCCL INFO [Proxy Service] Device 1 CPU core 60
+lshn-qs-pjul-8:1217538:1218946 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 169
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO ncclCommSplit comm 0x21787fd0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 18 color 440515407 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.06, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO ncclCommSplit comm 0x206f1f00 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 20 color 1227022723 key 0- Init START
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO comm 0x206f1f00 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217539:1218960 [2] NCCL INFO [Proxy Service] Device 2 CPU core 151
+lshn-qs-pjul-8:1217539:1218961 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 63
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO ncclCommSplit comm 0x206f1f00 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 20 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO ncclCommSplit comm 0x21b2d3c0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 22 color 1301067556 key 0- Init START
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO comm 0x21b2d3c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217540:1218977 [3] NCCL INFO [Proxy Service] Device 3 CPU core 181
+lshn-qs-pjul-8:1217540:1218978 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 189
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO ncclCommSplit comm 0x21b2d3c0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 22 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.05, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO ncclCommSplit comm 0x1ffcfe70 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 23 color 59908776 key 0- Init START
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO comm 0x1ffcfe70 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217537:1218986 [0] NCCL INFO [Proxy Service] Device 0 CPU core 53
+lshn-qs-pjul-8:1217537:1218987 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 62
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO ncclCommSplit comm 0x1ffcfe70 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 23 color 59908776 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.04, rest 0.08)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO ncclCommSplit comm 0x2188fbe0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 25 color 440515407 key 0- Init START
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO comm 0x2188fbe0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217538:1219001 [1] NCCL INFO [Proxy Service] Device 1 CPU core 184
+lshn-qs-pjul-8:1217538:1219002 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 48
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO ncclCommSplit comm 0x2188fbe0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 25 color 440515407 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.01, graphs 0.00, connections 0.03, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO ncclCommSplit comm 0x207f9b10 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 27 color 1227022723 key 0- Init START
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO comm 0x207f9b10 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217539:1219016 [2] NCCL INFO [Proxy Service] Device 2 CPU core 157
+lshn-qs-pjul-8:1217539:1219017 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 67
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO ncclCommSplit comm 0x207f9b10 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 27 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.00, connections 0.07, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO ncclCommSplit comm 0x21c34fd0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 29 color 1301067556 key 0- Init START
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO comm 0x21c34fd0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217540:1219033 [3] NCCL INFO [Proxy Service] Device 3 CPU core 58
+lshn-qs-pjul-8:1217540:1219034 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 74
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO ncclCommSplit comm 0x21c34fd0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 29 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.04 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.03, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO ncclCommSplit comm 0x200d7a80 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 30 color 59908776 key 0- Init START
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO comm 0x200d7a80 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217537:1219042 [0] NCCL INFO [Proxy Service] Device 0 CPU core 68
+lshn-qs-pjul-8:1217537:1219043 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 177
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO ncclCommSplit comm 0x200d7a80 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 30 color 59908776 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.17 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.00, connections 0.08, rest 0.05)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO ncclCommSplit comm 0x219977f0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 32 color 440515407 key 0- Init START
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO comm 0x219977f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217538:1219058 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 55
+lshn-qs-pjul-8:1217538:1219057 [1] NCCL INFO [Proxy Service] Device 1 CPU core 89
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO ncclCommSplit comm 0x219977f0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 32 color 440515407 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.05 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.00, connections 0.02, rest 0.01)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO ncclCommSplit comm 0x20901720 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 34 color 1227022723 key 0- Init START
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO comm 0x20901720 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217539:1219072 [2] NCCL INFO [Proxy Service] Device 2 CPU core 147
+lshn-qs-pjul-8:1217539:1219073 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 85
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO ncclCommSplit comm 0x20901720 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 34 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Using network Socket
+INFO 12-02 17:35:12 [parallel_state.py:1165] rank 1 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-02 17:35:12 [parallel_state.py:1165] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-02 17:35:12 [parallel_state.py:1165] rank 2 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO ncclCommSplit comm 0x21d3cbe0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 36 color 1301067556 key 0- Init START
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO comm 0x21d3cbe0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1217540:1219084 [3] NCCL INFO [Proxy Service] Device 3 CPU core 83
+lshn-qs-pjul-8:1217540:1219085 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 189
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO ncclCommSplit comm 0x21d3cbe0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 36 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+INFO 12-02 17:35:12 [parallel_state.py:1165] rank 3 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-02 17:35:13 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-02 17:35:13 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-02 17:35:13 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-02 17:35:13 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-02 17:35:13 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 17:35:13 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 17:35:13 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 17:35:13 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 17:35:13 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 17:35:13 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 17:35:13 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 17:35:13 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 17:35:14 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 17:35:14 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 17:35:14 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 17:35:14 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 17:35:14 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+INFO 12-02 17:35:15 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+
+Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
+[AINFO 12-02 17:35:16 [weight_utils.py:369] Time spent downloading weights for deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B: 1.021682 seconds
+INFO 12-02 17:35:16 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+INFO 12-02 17:35:17 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+
+Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:02<00:00,  2.80s/it]
+[ALoading safetensors checkpoint shards: 100% Completed | 1/1 [00:02<00:00,  2.80s/it]
+
+INFO 12-02 17:35:18 [default_loader.py:268] Loading weights took 1.58 seconds
+INFO 12-02 17:35:18 [default_loader.py:268] Loading weights took 1.25 seconds
+INFO 12-02 17:35:18 [default_loader.py:268] Loading weights took 3.21 seconds
+INFO 12-02 17:35:18 [default_loader.py:268] Loading weights took 3.80 seconds
+INFO 12-02 17:35:18 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 4.735971 seconds
+INFO 12-02 17:35:18 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 5.083579 seconds
+INFO 12-02 17:35:19 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 5.064209 seconds
+INFO 12-02 17:35:19 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 5.147085 seconds
+INFO 12-02 17:35:24 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_2_0/backbone for vLLM's torch.compile
+INFO 12-02 17:35:24 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_1_0/backbone for vLLM's torch.compile
+INFO 12-02 17:35:24 [backends.py:550] Dynamo bytecode transform time: 5.90 s
+INFO 12-02 17:35:24 [backends.py:550] Dynamo bytecode transform time: 5.71 s
+INFO 12-02 17:35:25 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_3_0/backbone for vLLM's torch.compile
+INFO 12-02 17:35:25 [backends.py:550] Dynamo bytecode transform time: 5.73 s
+INFO 12-02 17:35:25 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_0_0/backbone for vLLM's torch.compile
+INFO 12-02 17:35:25 [backends.py:550] Dynamo bytecode transform time: 5.76 s
+INFO 12-02 17:35:28 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.940 s
+INFO 12-02 17:35:28 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.951 s
+INFO 12-02 17:35:28 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 3.018 s
+INFO 12-02 17:35:28 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 3.040 s
+INFO 12-02 17:35:28 [monitor.py:34] torch.compile takes 5.71 s in total
+INFO 12-02 17:35:28 [monitor.py:34] torch.compile takes 5.90 s in total
+INFO 12-02 17:35:28 [monitor.py:34] torch.compile takes 5.73 s in total
+INFO 12-02 17:35:28 [monitor.py:34] torch.compile takes 5.76 s in total
+INFO 12-02 17:35:29 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-02 17:35:29 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-02 17:35:29 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-02 17:35:29 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-02 17:35:30 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-02 17:35:30 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-02 17:35:30 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-02 17:35:30 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-02 17:35:30 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 12-02 17:35:30 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 12-02 17:35:30 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 12-02 17:35:30 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/11 [00:00<?, ?it/s][A
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  27%|██▋       | 3/11 [00:00<00:00, 28.93it/s][A
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  64%|██████▎   | 7/11 [00:00<00:00, 32.16it/s][A
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 11/11 [00:00<00:00, 33.07it/s][ACapturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 11/11 [00:00<00:00, 32.46it/s]
+INFO 12-02 17:35:31 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-02 17:35:31 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-02 17:35:31 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-02 17:35:31 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-02 17:35:31 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-02 17:35:31 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-02 17:35:31 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.20 seconds
+INFO 12-02 17:35:31 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.37 seconds
+INFO 12-02 17:35:31 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.62 seconds
+INFO 12-02 17:35:31 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-02 17:35:31 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-02 17:35:31 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.31 seconds
+INFO 12-02 17:35:32 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-02 17:35:32 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 00/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 01/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 02/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 03/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 04/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 05/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 06/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 07/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 08/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 09/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 10/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 11/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 12/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 13/0 : 0[4] -> 1[5] via P2P/CUMEM
+INFO 12-02 17:35:32 [llm.py:295] Supported_tasks: ('generate',)
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 14/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 15/0 : 0[4] -> 1[5] via P2P/CUMEM
+INFO 12-02 17:35:32 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 16/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 17/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 18/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 19/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 20/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 21/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 22/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 23/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 00/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 01/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 02/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 03/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 04/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 05/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 06/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 07/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 08/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 09/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 10/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 11/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 12/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 13/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 14/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 15/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 16/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 17/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 18/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 19/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 20/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 21/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 22/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 23/0 : 1[5] -> 2[6] via P2P/CUMEM
+INFO 12-02 17:35:32 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-02 17:35:32 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 00/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 01/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 02/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 03/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 04/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 05/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 06/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 07/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 08/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 09/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 10/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 11/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 12/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 13/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 14/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 15/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 16/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 17/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 18/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 19/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 20/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 21/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 22/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 23/0 : 2[6] -> 3[7] via P2P/CUMEM
+INFO 12-02 17:35:32 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-02 17:35:32 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 00/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 01/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 02/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 03/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 04/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 05/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 06/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 07/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 08/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 09/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 10/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 11/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 12/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 13/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 14/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 15/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 16/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 17/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 18/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 19/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 20/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 21/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 22/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 23/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+[OpenTinker] 2025-12-02 17:35:33,615 - accelerate.accelerator - WARNING - Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value.
+lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO ncclCommSplit comm 0x1abf0240 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 37 color 2003953581 key 2- Init START
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO ncclCommSplit comm 0x17706840 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 37 color 2003953581 key 3- Init START
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO ncclCommSplit comm 0x1c0253c0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 37 color 2003953581 key 1- Init START
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO ncclCommSplit comm 0x1a380640 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 37 color 2003953581 key 0- Init START
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO comm 0x17706840 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO comm 0x1a380640 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO comm 0x1abf0240 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO comm 0x1c0253c0 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1217540:1219226 [3] NCCL INFO [Proxy Service] Device 3 CPU core 61
+lshn-qs-pjul-8:1217540:1219227 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 68
+lshn-qs-pjul-8:1217538:1219228 [1] NCCL INFO [Proxy Service] Device 1 CPU core 151
+lshn-qs-pjul-8:1217538:1219229 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 82
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-pjul-8:1217537:1219230 [0] NCCL INFO [Proxy Service] Device 0 CPU core 179
+lshn-qs-pjul-8:1217537:1219231 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 84
+lshn-qs-pjul-8:1217539:1219232 [2] NCCL INFO [Proxy Service] Device 2 CPU core 86
+lshn-qs-pjul-8:1217539:1219233 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 87
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO ncclCommSplit comm 0x1abf0240 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 37 color 2003953581 key 2 - Init COMPLETE
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO ncclCommSplit comm 0x1a380640 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 37 color 2003953581 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.08 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.03, rest 0.01)
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.03, rest 0.06)
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO ncclCommSplit comm 0x17706840 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 37 color 2003953581 key 3 - Init COMPLETE
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO ncclCommSplit comm 0x1c0253c0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 37 color 2003953581 key 1 - Init COMPLETE
+lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.17 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.03, rest 0.10)
+lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.02, rest 0.06)
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 00/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 00/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 01/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 00/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 01/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 00/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 02/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 01/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 02/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 01/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 03/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 03/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 02/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 02/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 04/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 04/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 03/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 05/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 03/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 05/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 04/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 04/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 06/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 06/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 05/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 07/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 07/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 05/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 06/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 08/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 08/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 06/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 07/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 09/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 09/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 07/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 08/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 10/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 10/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 08/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 09/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 11/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 11/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 09/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 10/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 12/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 12/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 10/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 11/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 13/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 13/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 11/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 12/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 14/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 14/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 12/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 13/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 15/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 15/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 13/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 14/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 16/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 16/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 14/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 15/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 17/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 17/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 15/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 16/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 18/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 18/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 16/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 17/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 19/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 19/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 17/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 18/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 20/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 20/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 18/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 19/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 21/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 21/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 19/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 20/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 22/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 22/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 20/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 21/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 23/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 23/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 21/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 22/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 22/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 23/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 23/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+INFO 12-02 17:35:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:35:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:35:36 [block_pool.py:292] Successfully reset prefix cache
+wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
+
+  0%|          | 0/1024 [00:00<?, ?it/s][AINFO 12-02 17:35:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:35:38 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-02 17:35:38 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-02 17:35:39 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-02 17:35:40 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|█████     | 513/1024 [02:50<02:50,  3.00it/s][A
+                                                  [A{'loss': 0.0261, 'grad_norm': 0.002207641489803791, 'learning_rate': 1e-05, 'num_tokens': 431914122.0, 'completions/mean_length': 6347.03125, 'completions/min_length': 514.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 5766.3798828125, 'completions/min_terminated_length': 514.0, 'completions/max_terminated_length': 15874.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.21648237109184265, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01931341364979744, 'sampling/sampling_logp_difference/max': 6.781994819641113, 'sampling/importance_sampling_ratio/min': 0.0011340104974806309, 'sampling/importance_sampling_ratio/mean': 0.999993085861206, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9512053951621056, 'clip_ratio/low_mean': 2.538728870149498e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3562190588345402e-06, 'clip_ratio/high_max': 5.424876235338161e-06, 'clip_ratio/region_mean': 2.674350776032952e-05, 'epoch': 0.47}
+
+ 50%|█████     | 513/1024 [02:50<02:50,  3.00it/s][AINFO 12-02 17:38:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:38:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:38:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:38:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|█████     | 514/1024 [05:31<06:37,  1.28it/s][A
+                                                  [A{'loss': 0.0637, 'grad_norm': 0.0018057655543088913, 'learning_rate': 1e-05, 'num_tokens': 432663249.0, 'completions/mean_length': 5719.8671875, 'completions/min_length': 14.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5635.8974609375, 'completions/min_terminated_length': 14.0, 'completions/max_terminated_length': 14757.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.32035762071609497, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019999589771032333, 'sampling/sampling_logp_difference/max': 8.730670928955078, 'sampling/importance_sampling_ratio/min': 0.00016155402408912778, 'sampling/importance_sampling_ratio/mean': 0.9999573230743408, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9754309803247452, 'clip_ratio/low_mean': 4.058695458297734e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0820788765595353e-06, 'clip_ratio/high_max': 1.2328315506238141e-05, 'clip_ratio/region_mean': 4.366903374375397e-05, 'epoch': 0.47}
+
+ 50%|█████     | 514/1024 [05:31<06:37,  1.28it/s][AINFO 12-02 17:41:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:41:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:41:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:41:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|█████     | 515/1024 [08:18<12:09,  1.43s/it][A
+                                                  [A{'loss': 0.0843, 'grad_norm': 0.002828414784744382, 'learning_rate': 1e-05, 'num_tokens': 433448874.0, 'completions/mean_length': 5969.1328125, 'completions/min_length': 367.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5803.81787109375, 'completions/min_terminated_length': 367.0, 'completions/max_terminated_length': 16206.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019238140434026718, 'sampling/sampling_logp_difference/max': 7.749990463256836, 'sampling/importance_sampling_ratio/min': 0.00043074661516584456, 'sampling/importance_sampling_ratio/mean': 0.9999672174453735, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9498241171240807, 'clip_ratio/low_mean': 3.319342158647487e-05, 'clip_ratio/low_min': 3.644846174211125e-06, 'clip_ratio/high_mean': 8.369293027499225e-06, 'clip_ratio/high_max': 3.34771721099969e-05, 'clip_ratio/region_mean': 4.1562714159226744e-05, 'epoch': 0.47}
+
+ 50%|█████     | 515/1024 [08:18<12:09,  1.43s/it][AINFO 12-02 17:43:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:43:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:43:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:43:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|█████     | 516/1024 [10:52<19:24,  2.29s/it][A
+                                                  [A{'loss': 0.0535, 'grad_norm': 0.0019250004552304745, 'learning_rate': 1e-05, 'num_tokens': 434338609.0, 'completions/mean_length': 6808.3671875, 'completions/min_length': 857.0, 'completions/max_length': 14714.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6808.3671875, 'completions/min_terminated_length': 857.0, 'completions/max_terminated_length': 14714.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.36007601022720337, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01927364431321621, 'sampling/sampling_logp_difference/max': 8.25802230834961, 'sampling/importance_sampling_ratio/min': 0.00025917106540873647, 'sampling/importance_sampling_ratio/mean': 0.9999670386314392, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9247330650687218, 'clip_ratio/low_mean': 4.7241341690096306e-05, 'clip_ratio/low_min': 4.075511242263019e-06, 'clip_ratio/high_mean': 7.50266553950496e-06, 'clip_ratio/high_max': 2.4458067855448462e-05, 'clip_ratio/region_mean': 5.4744006320106564e-05, 'epoch': 0.47}
+
+ 50%|█████     | 516/1024 [10:52<19:24,  2.29s/it][AINFO 12-02 17:46:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:46:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:46:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:46:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|█████     | 517/1024 [13:27<29:38,  3.51s/it][A
+                                                  [A{'loss': 0.0405, 'grad_norm': 0.0022343189921230078, 'learning_rate': 1e-05, 'num_tokens': 435145247.0, 'completions/mean_length': 6119.921875, 'completions/min_length': 171.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6039.1025390625, 'completions/min_terminated_length': 171.0, 'completions/max_terminated_length': 15500.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2467075139284134, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019618261605501175, 'sampling/sampling_logp_difference/max': 9.095551490783691, 'sampling/importance_sampling_ratio/min': 0.00011216365965083241, 'sampling/importance_sampling_ratio/mean': 0.9998999834060669, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9210109040141106, 'clip_ratio/low_mean': 1.523887078747066e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.167569042896503e-06, 'clip_ratio/high_max': 2.067027617158601e-05, 'clip_ratio/region_mean': 2.0406439944054e-05, 'epoch': 0.48}
+
+ 50%|█████     | 517/1024 [13:27<29:38,  3.51s/it][AINFO 12-02 17:49:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:49:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:49:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:49:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 518/1024 [16:12<44:51,  5.32s/it][A
+                                                  [A{'loss': 0.0634, 'grad_norm': 0.00324260420165956, 'learning_rate': 1e-05, 'num_tokens': 435964383.0, 'completions/mean_length': 6259.0625, 'completions/min_length': 1087.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6179.33837890625, 'completions/min_terminated_length': 1087.0, 'completions/max_terminated_length': 16282.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3424547016620636, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019678015261888504, 'sampling/sampling_logp_difference/max': 11.06246566772461, 'sampling/importance_sampling_ratio/min': 1.5690335203544237e-05, 'sampling/importance_sampling_ratio/mean': 0.9999647736549377, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9430939853191376, 'clip_ratio/low_mean': 3.679497240227647e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.821650009034784e-06, 'clip_ratio/high_max': 1.9286600036139134e-05, 'clip_ratio/region_mean': 4.1616622866058606e-05, 'epoch': 0.48}
+
+ 51%|█████     | 518/1024 [16:12<44:51,  5.32s/it][AINFO 12-02 17:51:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:51:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:51:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:51:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 519/1024 [18:49<1:04:56,  7.72s/it][A
+                                                    [A{'loss': 0.085, 'grad_norm': 0.0017788221593946218, 'learning_rate': 1e-05, 'num_tokens': 436866830.0, 'completions/mean_length': 6898.8671875, 'completions/min_length': 701.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6748.31005859375, 'completions/min_terminated_length': 701.0, 'completions/max_terminated_length': 14682.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.26932865381240845, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019975006580352783, 'sampling/sampling_logp_difference/max': 4.929899215698242, 'sampling/importance_sampling_ratio/min': 0.007227231748402119, 'sampling/importance_sampling_ratio/mean': 1.000014305114746, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9633238166570663, 'clip_ratio/low_mean': 3.5416796038134635e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2957127637491794e-06, 'clip_ratio/high_max': 5.182851054996718e-06, 'clip_ratio/region_mean': 3.6712508745040395e-05, 'epoch': 0.48}
+
+ 51%|█████     | 519/1024 [18:49<1:04:56,  7.72s/it][AINFO 12-02 17:54:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 520/1024 [21:37<1:34:32, 11.26s/it][A
+                                                    [A{'loss': 0.079, 'grad_norm': 0.0021614902652800083, 'learning_rate': 1e-05, 'num_tokens': 437728081.0, 'completions/mean_length': 6568.8359375, 'completions/min_length': 101.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6333.2724609375, 'completions/min_terminated_length': 101.0, 'completions/max_terminated_length': 16277.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.24487745761871338, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019699109718203545, 'sampling/sampling_logp_difference/max': 6.582705020904541, 'sampling/importance_sampling_ratio/min': 0.001384100178256631, 'sampling/importance_sampling_ratio/mean': 0.9999655485153198, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9648878574371338, 'clip_ratio/low_mean': 3.3715954828039685e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.584388079820201e-06, 'clip_ratio/high_max': 1.8337552319280803e-05, 'clip_ratio/region_mean': 3.8300342453112535e-05, 'epoch': 0.48}
+
+ 51%|█████     | 520/1024 [21:37<1:34:32, 11.26s/it][AINFO 12-02 17:57:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 521/1024 [24:33<2:16:39, 16.30s/it][A
+                                                    [A{'loss': 0.0262, 'grad_norm': 0.002030634554103017, 'learning_rate': 1e-05, 'num_tokens': 438605294.0, 'completions/mean_length': 6709.7265625, 'completions/min_length': 520.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6233.9423828125, 'completions/min_terminated_length': 520.0, 'completions/max_terminated_length': 16261.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2435920089483261, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01805954799056053, 'sampling/sampling_logp_difference/max': 4.624210357666016, 'sampling/importance_sampling_ratio/min': 0.00981139950454235, 'sampling/importance_sampling_ratio/mean': 0.9999866485595703, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.791545994579792, 'clip_ratio/low_mean': 5.360748559724016e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.935241918246902e-06, 'clip_ratio/high_max': 1.9740967672987608e-05, 'clip_ratio/region_mean': 1.0295990477970918e-05, 'epoch': 0.48}
+
+ 51%|█████     | 521/1024 [24:33<2:16:39, 16.30s/it][AINFO 12-02 18:00:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:00:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:00:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:00:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 522/1024 [27:35<3:14:11, 23.21s/it][A
+                                                    [A{'loss': 0.0634, 'grad_norm': 0.0014125843299552798, 'learning_rate': 1e-05, 'num_tokens': 439462971.0, 'completions/mean_length': 6556.9140625, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6400.9287109375, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3158818185329437, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018355879932641983, 'sampling/sampling_logp_difference/max': 17.18090057373047, 'sampling/importance_sampling_ratio/min': 3.454853825246573e-08, 'sampling/importance_sampling_ratio/mean': 0.999947726726532, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.886083297431469, 'clip_ratio/low_mean': 2.266609857315416e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9159130602020014e-06, 'clip_ratio/high_max': 7.663652240808005e-06, 'clip_ratio/region_mean': 2.4582011747042998e-05, 'epoch': 0.48}
+
+ 51%|█████     | 522/1024 [27:35<3:14:11, 23.21s/it][AINFO 12-02 18:03:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:03:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:03:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:03:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 523/1024 [30:06<4:14:08, 30.44s/it][A
+                                                    [A{'loss': 0.0204, 'grad_norm': 0.00171169254463166, 'learning_rate': 1e-05, 'num_tokens': 440268882.0, 'completions/mean_length': 6152.4921875, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6071.92919921875, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 16161.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.250127375125885, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018800247460603714, 'sampling/sampling_logp_difference/max': 3.2767605781555176, 'sampling/importance_sampling_ratio/min': 0.03775034472346306, 'sampling/importance_sampling_ratio/mean': 0.99989914894104, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9536242336034775, 'clip_ratio/low_mean': 3.2019113405112876e-05, 'clip_ratio/low_min': 4.055676527059404e-06, 'clip_ratio/high_mean': 2.296614070473879e-06, 'clip_ratio/high_max': 9.186456281895516e-06, 'clip_ratio/region_mean': 3.431572758927359e-05, 'epoch': 0.48}
+
+ 51%|█████     | 523/1024 [30:06<4:14:08, 30.44s/it][AINFO 12-02 18:05:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:05:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:05:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:05:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 524/1024 [32:28<5:23:12, 38.79s/it][A
+                                                    [A{'loss': 0.1033, 'grad_norm': 0.002236112719401717, 'learning_rate': 1e-05, 'num_tokens': 441020904.0, 'completions/mean_length': 5740.796875, 'completions/min_length': 731.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5656.9921875, 'completions/min_terminated_length': 731.0, 'completions/max_terminated_length': 15578.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.34353315830230713, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018967337906360626, 'sampling/sampling_logp_difference/max': 2.378675699234009, 'sampling/importance_sampling_ratio/min': 0.09267321974039078, 'sampling/importance_sampling_ratio/mean': 0.999980628490448, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9311753436923027, 'clip_ratio/low_mean': 4.899439159089525e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1835992356500356e-06, 'clip_ratio/high_max': 8.734396942600142e-06, 'clip_ratio/region_mean': 5.117799059917161e-05, 'epoch': 0.48}
+
+ 51%|█████     | 524/1024 [32:28<5:23:12, 38.79s/it][AINFO 12-02 18:08:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████▏    | 525/1024 [35:16<7:05:21, 51.14s/it][A
+                                                    [A{'loss': 0.0173, 'grad_norm': 0.0011770959245041013, 'learning_rate': 1e-05, 'num_tokens': 441970986.0, 'completions/mean_length': 7279.078125, 'completions/min_length': 303.0, 'completions/max_length': 15338.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7279.078125, 'completions/min_terminated_length': 303.0, 'completions/max_terminated_length': 15338.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.2382800281047821, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02134837955236435, 'sampling/sampling_logp_difference/max': 11.367501258850098, 'sampling/importance_sampling_ratio/min': 1.1565300155780278e-05, 'sampling/importance_sampling_ratio/mean': 0.9999333620071411, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.170717716217041, 'clip_ratio/low_mean': 1.9387059296605003e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.9387059296605003e-05, 'epoch': 0.48}
+
+ 51%|█████▏    | 525/1024 [35:16<7:05:21, 51.14s/it][AINFO 12-02 18:10:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:10:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:10:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:10:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████▏    | 526/1024 [38:01<8:59:17, 64.97s/it][A
+                                                    [A{'loss': 0.0362, 'grad_norm': 0.002001611515879631, 'learning_rate': 1e-05, 'num_tokens': 442936808.0, 'completions/mean_length': 7376.796875, 'completions/min_length': 112.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7233.82568359375, 'completions/min_terminated_length': 112.0, 'completions/max_terminated_length': 15485.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.33220988512039185, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02169732004404068, 'sampling/sampling_logp_difference/max': 5.697751998901367, 'sampling/importance_sampling_ratio/min': 0.003353495616465807, 'sampling/importance_sampling_ratio/mean': 0.9999234676361084, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0409907028079033, 'clip_ratio/low_mean': 3.5013973274544696e-05, 'clip_ratio/low_min': 4.0234326661447994e-06, 'clip_ratio/high_mean': 4.595597602019552e-06, 'clip_ratio/high_max': 1.838239040807821e-05, 'clip_ratio/region_mean': 3.960957087656425e-05, 'epoch': 0.48}
+
+ 51%|█████▏    | 526/1024 [38:01<8:59:17, 64.97s/it][AINFO 12-02 18:13:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:13:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:13:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:13:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████▏    | 527/1024 [40:42<10:55:17, 79.11s/it][A
+                                                     [A{'loss': 0.021, 'grad_norm': 0.0023104713764041662, 'learning_rate': 1e-05, 'num_tokens': 443843010.0, 'completions/mean_length': 6904.515625, 'completions/min_length': 1159.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6829.8740234375, 'completions/min_terminated_length': 1159.0, 'completions/max_terminated_length': 15883.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.226732075214386, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020169749855995178, 'sampling/sampling_logp_difference/max': 6.179650783538818, 'sampling/importance_sampling_ratio/min': 0.0020711510442197323, 'sampling/importance_sampling_ratio/mean': 0.9999308586120605, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9905650988221169, 'clip_ratio/low_mean': 3.393580459487566e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.393580459487566e-05, 'epoch': 0.48}
+
+ 51%|█████▏    | 527/1024 [40:42<10:55:17, 79.11s/it][AINFO 12-02 18:16:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:16:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:16:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:16:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 528/1024 [43:09<12:30:43, 90.81s/it][A
+                                                     [A{'loss': 0.0606, 'grad_norm': 0.0032739758025854826, 'learning_rate': 1e-05, 'num_tokens': 444709854.0, 'completions/mean_length': 6611.21875, 'completions/min_length': 11.0, 'completions/max_length': 15035.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6611.21875, 'completions/min_terminated_length': 11.0, 'completions/max_terminated_length': 15035.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.30327799916267395, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018246350809931755, 'sampling/sampling_logp_difference/max': 8.124988555908203, 'sampling/importance_sampling_ratio/min': 0.00029604812152683735, 'sampling/importance_sampling_ratio/mean': 0.999891996383667, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8890361413359642, 'clip_ratio/low_mean': 3.806211361734313e-05, 'clip_ratio/low_min': 4.1808816604316235e-06, 'clip_ratio/high_mean': 8.185342608157953e-07, 'clip_ratio/high_max': 3.274137043263181e-06, 'clip_ratio/region_mean': 3.8880647935002344e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 528/1024 [43:09<12:30:43, 90.81s/it][AINFO 12-02 18:18:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:18:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:18:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:18:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 529/1024 [45:43<14:13:42, 103.48s/it][A
+                                                      [A{'loss': 0.0528, 'grad_norm': 0.0018802061676979065, 'learning_rate': 1e-05, 'num_tokens': 445614284.0, 'completions/mean_length': 6928.296875, 'completions/min_length': 772.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6853.84228515625, 'completions/min_terminated_length': 772.0, 'completions/max_terminated_length': 15999.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2619747221469879, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019618764519691467, 'sampling/sampling_logp_difference/max': 3.895602226257324, 'sampling/importance_sampling_ratio/min': 0.02033112570643425, 'sampling/importance_sampling_ratio/mean': 0.9999129176139832, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9614408612251282, 'clip_ratio/low_mean': 3.133989605430543e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.133989605430543e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 529/1024 [45:43<14:13:42, 103.48s/it][AINFO 12-02 18:21:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:21:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:21:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:21:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 530/1024 [48:42<16:28:55, 120.11s/it][A
+                                                      [A{'loss': 0.0192, 'grad_norm': 0.002164191100746393, 'learning_rate': 1e-05, 'num_tokens': 446649731.0, 'completions/mean_length': 7946.8671875, 'completions/min_length': 540.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7812.94482421875, 'completions/min_terminated_length': 540.0, 'completions/max_terminated_length': 16194.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020579926669597626, 'sampling/sampling_logp_difference/max': 6.291524410247803, 'sampling/importance_sampling_ratio/min': 0.0018519347067922354, 'sampling/importance_sampling_ratio/mean': 0.9999844431877136, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9987246319651604, 'clip_ratio/low_mean': 1.9743174675568298e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.9743174675568298e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 530/1024 [48:42<16:28:55, 120.11s/it][AINFO 12-02 18:24:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:24:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:24:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:24:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 531/1024 [51:24<17:50:22, 130.27s/it][A
+                                                      [A{'loss': 0.0375, 'grad_norm': 0.002090689493343234, 'learning_rate': 1e-05, 'num_tokens': 447536311.0, 'completions/mean_length': 6763.53125, 'completions/min_length': 834.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6532.64013671875, 'completions/min_terminated_length': 834.0, 'completions/max_terminated_length': 14446.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019683964550495148, 'sampling/sampling_logp_difference/max': 4.223954677581787, 'sampling/importance_sampling_ratio/min': 0.014640630222856998, 'sampling/importance_sampling_ratio/mean': 0.9999126195907593, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9593042582273483, 'clip_ratio/low_mean': 2.4596658477094024e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.4596658477094024e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 531/1024 [51:24<17:50:22, 130.27s/it][AINFO 12-02 18:27:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:27:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:27:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:27:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 532/1024 [54:31<19:45:51, 144.62s/it][A
+                                                      [A{'loss': 0.0288, 'grad_norm': 0.002562359906733036, 'learning_rate': 1e-05, 'num_tokens': 448505707.0, 'completions/mean_length': 7394.40625, 'completions/min_length': 191.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7323.6220703125, 'completions/min_terminated_length': 191.0, 'completions/max_terminated_length': 15805.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.17123225331306458, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020989736542105675, 'sampling/sampling_logp_difference/max': 8.221636772155762, 'sampling/importance_sampling_ratio/min': 0.0002687747764866799, 'sampling/importance_sampling_ratio/mean': 0.9999560713768005, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0184528306126595, 'clip_ratio/low_mean': 2.527509309402376e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.527509309402376e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 532/1024 [54:31<19:45:51, 144.62s/it][AINFO 12-02 18:30:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:30:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:30:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:30:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 533/1024 [57:38<21:17:36, 156.12s/it][A
+                                                      [A{'loss': 0.0153, 'grad_norm': 0.0017445285338908434, 'learning_rate': 1e-05, 'num_tokens': 449443709.0, 'completions/mean_length': 7196.328125, 'completions/min_length': 419.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6822.84521484375, 'completions/min_terminated_length': 419.0, 'completions/max_terminated_length': 16360.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.21436558663845062, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021059826016426086, 'sampling/sampling_logp_difference/max': 6.656659126281738, 'sampling/importance_sampling_ratio/min': 0.0012854337692260742, 'sampling/importance_sampling_ratio/mean': 0.9999077320098877, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0106298848986626, 'clip_ratio/low_mean': 2.0207754744205886e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1930212622246472e-06, 'clip_ratio/high_max': 4.772085048898589e-06, 'clip_ratio/region_mean': 2.140077623380421e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 533/1024 [57:38<21:17:36, 156.12s/it][AINFO 12-02 18:33:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:33:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:33:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:33:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 534/1024 [1:00:30<21:49:28, 160.34s/it][A
+                                                        [A{'loss': 0.0602, 'grad_norm': 0.0015701872762292624, 'learning_rate': 1e-05, 'num_tokens': 450412866.0, 'completions/mean_length': 7418.3515625, 'completions/min_length': 1445.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7203.17626953125, 'completions/min_terminated_length': 1445.0, 'completions/max_terminated_length': 16118.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2987973093986511, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020907817408442497, 'sampling/sampling_logp_difference/max': 9.409852027893066, 'sampling/importance_sampling_ratio/min': 8.191307279048488e-05, 'sampling/importance_sampling_ratio/mean': 0.9999527335166931, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.002836562693119, 'clip_ratio/low_mean': 4.0835892605173285e-05, 'clip_ratio/low_min': 3.619411700128694e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.0835892605173285e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 534/1024 [1:00:30<21:49:28, 160.34s/it][AINFO 12-02 18:36:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:36:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:36:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:36:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 535/1024 [1:03:31<22:36:16, 166.41s/it][A
+                                                        [A{'loss': 0.055, 'grad_norm': 0.003912154585123062, 'learning_rate': 1e-05, 'num_tokens': 451331560.0, 'completions/mean_length': 7007.109375, 'completions/min_length': 312.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6782.064453125, 'completions/min_terminated_length': 312.0, 'completions/max_terminated_length': 14089.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.25354722142219543, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.026503996923565865, 'sampling/sampling_logp_difference/max': 15.999472618103027, 'sampling/importance_sampling_ratio/min': 1.125945416902141e-07, 'sampling/importance_sampling_ratio/mean': 0.9994460344314575, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9748141467571259, 'clip_ratio/low_mean': 9.472978547364619e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.761823504395579e-06, 'clip_ratio/high_max': 1.0691738907553372e-05, 'clip_ratio/region_mean': 9.949160914857202e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 535/1024 [1:03:31<22:36:16, 166.41s/it][AINFO 12-02 18:39:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:39:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:39:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:39:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 536/1024 [1:06:27<22:55:36, 169.13s/it][A
+                                                        [A{'loss': 0.0195, 'grad_norm': 0.0029959778767079115, 'learning_rate': 1e-05, 'num_tokens': 452197568.0, 'completions/mean_length': 6605.5, 'completions/min_length': 581.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6290.064453125, 'completions/min_terminated_length': 581.0, 'completions/max_terminated_length': 15336.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3180162310600281, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019547434523701668, 'sampling/sampling_logp_difference/max': 7.566179275512695, 'sampling/importance_sampling_ratio/min': 0.0005176665727049112, 'sampling/importance_sampling_ratio/mean': 0.9998653531074524, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9742915332317352, 'clip_ratio/low_mean': 3.870478303724667e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.793347559621907e-06, 'clip_ratio/high_max': 1.5173390238487627e-05, 'clip_ratio/region_mean': 4.249813082424225e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 536/1024 [1:06:27<22:55:36, 169.13s/it][AINFO 12-02 18:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:42:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 537/1024 [1:09:08<22:33:02, 166.70s/it][A
+                                                        [A{'loss': 0.064, 'grad_norm': 0.0027553467079997063, 'learning_rate': 1e-05, 'num_tokens': 452995762.0, 'completions/mean_length': 6091.828125, 'completions/min_length': 474.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6010.78759765625, 'completions/min_terminated_length': 474.0, 'completions/max_terminated_length': 16158.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.22437798976898193, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02093922719359398, 'sampling/sampling_logp_difference/max': 17.80914878845215, 'sampling/importance_sampling_ratio/min': 1.8432530168865924e-08, 'sampling/importance_sampling_ratio/mean': 1.0000625848770142, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9893068373203278, 'clip_ratio/low_mean': 3.348358245602867e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0584967640170362e-06, 'clip_ratio/high_max': 4.233987056068145e-06, 'clip_ratio/region_mean': 3.454207922004571e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 537/1024 [1:09:08<22:33:02, 166.70s/it][AINFO 12-02 18:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:44:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 538/1024 [1:11:50<22:20:13, 165.46s/it][A
+                                                        [A{'loss': 0.0199, 'grad_norm': 0.002985693048685789, 'learning_rate': 1e-05, 'num_tokens': 453896300.0, 'completions/mean_length': 6864.578125, 'completions/min_length': 656.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6789.6220703125, 'completions/min_terminated_length': 656.0, 'completions/max_terminated_length': 16080.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019800683483481407, 'sampling/sampling_logp_difference/max': 10.874801635742188, 'sampling/importance_sampling_ratio/min': 1.8929262296296656e-05, 'sampling/importance_sampling_ratio/mean': 0.999870777130127, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.005393773317337, 'clip_ratio/low_mean': 5.346296995867306e-05, 'clip_ratio/low_min': 5.110593065182911e-06, 'clip_ratio/high_mean': 7.481887735139026e-06, 'clip_ratio/high_max': 2.9927550940556102e-05, 'clip_ratio/region_mean': 6.094485820540285e-05, 'epoch': 0.49}
+
+ 53%|█████▎    | 538/1024 [1:11:50<22:20:13, 165.46s/it][AINFO 12-02 18:47:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:47:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:47:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:47:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 539/1024 [1:14:11<21:18:52, 158.21s/it][A
+                                                        [A{'loss': 0.0539, 'grad_norm': 0.003145795315504074, 'learning_rate': 1e-05, 'num_tokens': 454661564.0, 'completions/mean_length': 5828.125, 'completions/min_length': 506.0, 'completions/max_length': 14651.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5828.125, 'completions/min_terminated_length': 506.0, 'completions/max_terminated_length': 14651.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.24670752882957458, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019065624102950096, 'sampling/sampling_logp_difference/max': 12.140581130981445, 'sampling/importance_sampling_ratio/min': 5.3384183047455736e-06, 'sampling/importance_sampling_ratio/mean': 0.9999277591705322, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.909324087202549, 'clip_ratio/low_mean': 2.9533587621699553e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.037869075546041e-06, 'clip_ratio/high_max': 1.2092638826288749e-05, 'clip_ratio/region_mean': 3.3571456697245594e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 539/1024 [1:14:11<21:18:52, 158.21s/it][AINFO 12-02 18:49:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:49:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:49:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:49:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 540/1024 [1:16:51<21:20:26, 158.73s/it][A
+                                                        [A{'loss': 0.116, 'grad_norm': 0.0026711132377386093, 'learning_rate': 1e-05, 'num_tokens': 455477577.0, 'completions/mean_length': 6213.4140625, 'completions/min_length': 11.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6051.9765625, 'completions/min_terminated_length': 11.0, 'completions/max_terminated_length': 15274.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.28930407762527466, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01995767280459404, 'sampling/sampling_logp_difference/max': 7.793475151062012, 'sampling/importance_sampling_ratio/min': 0.00041241716826334596, 'sampling/importance_sampling_ratio/mean': 1.0000125169754028, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9570266529917717, 'clip_ratio/low_mean': 4.5576647153211525e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.86170045835388e-06, 'clip_ratio/high_max': 2.344680183341552e-05, 'clip_ratio/region_mean': 5.143834823684301e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 540/1024 [1:16:51<21:20:26, 158.73s/it][AINFO 12-02 18:52:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:52:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:52:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:52:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 541/1024 [1:19:32<21:23:06, 159.39s/it][A
+                                                        [A{'loss': 0.1227, 'grad_norm': 0.0030442574061453342, 'learning_rate': 1e-05, 'num_tokens': 456408966.0, 'completions/mean_length': 7125.4140625, 'completions/min_length': 1374.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7052.51171875, 'completions/min_terminated_length': 1374.0, 'completions/max_terminated_length': 15132.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3816363215446472, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020200349390506744, 'sampling/sampling_logp_difference/max': 8.374993324279785, 'sampling/importance_sampling_ratio/min': 0.00023056140344124287, 'sampling/importance_sampling_ratio/mean': 0.9999846816062927, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9259644895792007, 'clip_ratio/low_mean': 3.6731302770931507e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.815367676907044e-06, 'clip_ratio/high_max': 1.5261470707628177e-05, 'clip_ratio/region_mean': 4.054667033415171e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 541/1024 [1:19:32<21:23:06, 159.39s/it][AINFO 12-02 18:55:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:55:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:55:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:55:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 542/1024 [1:22:29<22:01:33, 164.51s/it][A
+                                                        [A{'loss': 0.0566, 'grad_norm': 0.002203581389039755, 'learning_rate': 1e-05, 'num_tokens': 457257011.0, 'completions/mean_length': 6472.1640625, 'completions/min_length': 80.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6314.83349609375, 'completions/min_terminated_length': 80.0, 'completions/max_terminated_length': 15909.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.26303553581237793, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018575064837932587, 'sampling/sampling_logp_difference/max': 9.3635892868042, 'sampling/importance_sampling_ratio/min': 8.579161658417434e-05, 'sampling/importance_sampling_ratio/mean': 0.9998785853385925, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8606229647994041, 'clip_ratio/low_mean': 4.665321148422663e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.665321148422663e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 542/1024 [1:22:29<22:01:33, 164.51s/it][AINFO 12-02 18:58:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:58:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:58:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:58:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 543/1024 [1:25:18<22:11:00, 166.03s/it][A
+                                                        [A{'loss': 0.0327, 'grad_norm': 0.001689116470515728, 'learning_rate': 1e-05, 'num_tokens': 458196355.0, 'completions/mean_length': 7154.0, 'completions/min_length': 920.0, 'completions/max_length': 16072.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7154.0, 'completions/min_terminated_length': 920.0, 'completions/max_terminated_length': 16072.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.18543371558189392, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0198836512863636, 'sampling/sampling_logp_difference/max': 8.124930381774902, 'sampling/importance_sampling_ratio/min': 0.00029606535099446774, 'sampling/importance_sampling_ratio/mean': 0.999965250492096, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.977513425052166, 'clip_ratio/low_mean': 2.8100045369683357e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9409409307845635e-06, 'clip_ratio/high_max': 1.1763763723138254e-05, 'clip_ratio/region_mean': 3.104098641415476e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 543/1024 [1:25:18<22:11:00, 166.03s/it][AINFO 12-02 19:00:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:00:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:00:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:00:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 544/1024 [1:27:58<21:53:21, 164.17s/it][A
+                                                        [A{'loss': 0.0448, 'grad_norm': 0.001968112075701356, 'learning_rate': 1e-05, 'num_tokens': 459095320.0, 'completions/mean_length': 6878.7265625, 'completions/min_length': 727.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6727.849609375, 'completions/min_terminated_length': 727.0, 'completions/max_terminated_length': 16003.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.30274122953414917, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019927173852920532, 'sampling/sampling_logp_difference/max': 8.833840370178223, 'sampling/importance_sampling_ratio/min': 0.00014571755309589207, 'sampling/importance_sampling_ratio/mean': 0.9999819993972778, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9291028156876564, 'clip_ratio/low_mean': 4.075526112501393e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9397220373539312e-06, 'clip_ratio/high_max': 1.1758888149415725e-05, 'clip_ratio/region_mean': 4.369498378764547e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 544/1024 [1:27:58<21:53:21, 164.17s/it][AINFO 12-02 19:03:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:03:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:03:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:03:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 545/1024 [1:31:00<22:32:01, 169.36s/it][A
+                                                        [A{'loss': 0.0236, 'grad_norm': 0.002588641829788685, 'learning_rate': 1e-05, 'num_tokens': 460042660.0, 'completions/mean_length': 7187.96875, 'completions/min_length': 50.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7042.00048828125, 'completions/min_terminated_length': 50.0, 'completions/max_terminated_length': 15599.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.2120065838098526, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021826796233654022, 'sampling/sampling_logp_difference/max': 14.562429428100586, 'sampling/importance_sampling_ratio/min': 4.738242012081173e-07, 'sampling/importance_sampling_ratio/mean': 0.9998769760131836, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1720879971981049, 'clip_ratio/low_mean': 5.1461796147123096e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.1461796147123096e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 545/1024 [1:31:00<22:32:01, 169.36s/it][AINFO 12-02 19:06:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:06:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:06:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:06:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 546/1024 [1:33:35<21:54:49, 165.04s/it][A
+                                                        [A{'loss': 0.064, 'grad_norm': 0.002047745743766427, 'learning_rate': 1e-05, 'num_tokens': 460864862.0, 'completions/mean_length': 6278.078125, 'completions/min_length': 118.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6035.5361328125, 'completions/min_terminated_length': 118.0, 'completions/max_terminated_length': 16019.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.625, 'reward_std': 0.31694266200065613, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017143042758107185, 'sampling/sampling_logp_difference/max': 4.461106777191162, 'sampling/importance_sampling_ratio/min': 0.011549573391675949, 'sampling/importance_sampling_ratio/mean': 0.9999461770057678, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8143310993909836, 'clip_ratio/low_mean': 4.735719005566352e-05, 'clip_ratio/low_min': 4.235134838381782e-06, 'clip_ratio/high_mean': 3.886304170919175e-06, 'clip_ratio/high_max': 1.55452166836767e-05, 'clip_ratio/region_mean': 5.1243494908703724e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 546/1024 [1:33:35<21:54:49, 165.04s/it][AINFO 12-02 19:09:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:09:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:09:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:09:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 547/1024 [1:36:04<21:14:10, 160.27s/it][A
+                                                        [A{'loss': 0.0867, 'grad_norm': 0.002950560301542282, 'learning_rate': 1e-05, 'num_tokens': 461608471.0, 'completions/mean_length': 5664.8828125, 'completions/min_length': 777.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5494.73828125, 'completions/min_terminated_length': 777.0, 'completions/max_terminated_length': 13575.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.625, 'reward_std': 0.2585597634315491, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019181005656719208, 'sampling/sampling_logp_difference/max': 10.999999046325684, 'sampling/importance_sampling_ratio/min': 1.6701715139788575e-05, 'sampling/importance_sampling_ratio/mean': 0.9999043345451355, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9489249512553215, 'clip_ratio/low_mean': 6.497366200619581e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.269977686519269e-07, 'clip_ratio/high_max': 2.9079910746077076e-06, 'clip_ratio/region_mean': 7.224363969271508e-06, 'epoch': 0.5}
+
+ 53%|█████▎    | 547/1024 [1:36:04<21:14:10, 160.27s/it][AINFO 12-02 19:11:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:11:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:11:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:11:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▎    | 548/1024 [1:38:44<21:11:59, 160.33s/it][A
+                                                        [A{'loss': 0.0112, 'grad_norm': 0.0013792186509817839, 'learning_rate': 1e-05, 'num_tokens': 462511519.0, 'completions/mean_length': 6914.4375, 'completions/min_length': 671.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6839.8740234375, 'completions/min_terminated_length': 671.0, 'completions/max_terminated_length': 16223.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.19674429297447205, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01967109739780426, 'sampling/sampling_logp_difference/max': 10.124999046325684, 'sampling/importance_sampling_ratio/min': 4.006533345091157e-05, 'sampling/importance_sampling_ratio/mean': 0.9999217391014099, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9416745603084564, 'clip_ratio/low_mean': 2.458288531670405e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.102939444943331e-06, 'clip_ratio/high_max': 1.2411757779773325e-05, 'clip_ratio/region_mean': 2.7685824761647382e-05, 'epoch': 0.5}
+
+ 54%|█████▎    | 548/1024 [1:38:44<21:11:59, 160.33s/it][AINFO 12-02 19:14:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:14:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:14:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:14:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▎    | 549/1024 [1:41:41<21:47:49, 165.20s/it][A
+                                                        [A{'loss': 0.0562, 'grad_norm': 0.0028722358401864767, 'learning_rate': 1e-05, 'num_tokens': 463472581.0, 'completions/mean_length': 7343.296875, 'completions/min_length': 564.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7051.6611328125, 'completions/min_terminated_length': 564.0, 'completions/max_terminated_length': 16336.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2880156934261322, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018903033807873726, 'sampling/sampling_logp_difference/max': 11.062175750732422, 'sampling/importance_sampling_ratio/min': 1.5694884496042505e-05, 'sampling/importance_sampling_ratio/mean': 0.9999880194664001, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.845381110906601, 'clip_ratio/low_mean': 3.393825062403266e-05, 'clip_ratio/low_min': 3.3629271456447896e-06, 'clip_ratio/high_mean': 1.8766649532153679e-06, 'clip_ratio/high_max': 7.5066598128614714e-06, 'clip_ratio/region_mean': 3.581491563409145e-05, 'epoch': 0.51}
+
+ 54%|█████▎    | 549/1024 [1:41:41<21:47:49, 165.20s/it][AINFO 12-02 19:17:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:17:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:17:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:17:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▎    | 550/1024 [1:44:12<21:12:47, 161.11s/it][A
+                                                        [A{'loss': 0.0074, 'grad_norm': 0.002812078921124339, 'learning_rate': 1e-05, 'num_tokens': 464263709.0, 'completions/mean_length': 6022.4375, 'completions/min_length': 447.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5940.8505859375, 'completions/min_terminated_length': 447.0, 'completions/max_terminated_length': 13716.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.26120057702064514, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01863965392112732, 'sampling/sampling_logp_difference/max': 7.119734764099121, 'sampling/importance_sampling_ratio/min': 0.0008089813054539263, 'sampling/importance_sampling_ratio/mean': 1.0000264644622803, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9279188066720963, 'clip_ratio/low_mean': 2.8057194754183e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.505368051217374e-06, 'clip_ratio/high_max': 1.6802483287392533e-05, 'clip_ratio/region_mean': 3.356256252118328e-05, 'epoch': 0.51}
+
+ 54%|█████▎    | 550/1024 [1:44:12<21:12:47, 161.11s/it][AINFO 12-02 19:19:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:19:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:19:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:19:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 551/1024 [1:47:17<22:04:23, 168.00s/it][A
+                                                        [A{'loss': 0.0607, 'grad_norm': 0.001776764984242618, 'learning_rate': 1e-05, 'num_tokens': 465167502.0, 'completions/mean_length': 6888.6328125, 'completions/min_length': 647.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6813.8662109375, 'completions/min_terminated_length': 647.0, 'completions/max_terminated_length': 16333.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.2961437702178955, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02100517973303795, 'sampling/sampling_logp_difference/max': 6.624998092651367, 'sampling/importance_sampling_ratio/min': 0.0013267829781398177, 'sampling/importance_sampling_ratio/mean': 0.999945342540741, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0720202773809433, 'clip_ratio/low_mean': 3.587696073736879e-05, 'clip_ratio/low_min': 2.965106659758021e-06, 'clip_ratio/high_mean': 5.5325897960756265e-06, 'clip_ratio/high_max': 1.799457299966889e-05, 'clip_ratio/region_mean': 4.140955002185365e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 551/1024 [1:47:17<22:04:23, 168.00s/it][AINFO 12-02 19:22:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:22:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:22:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:22:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 552/1024 [1:49:57<21:43:15, 165.67s/it][A
+                                                        [A{'loss': 0.0554, 'grad_norm': 0.0016460138140246272, 'learning_rate': 1e-05, 'num_tokens': 466034535.0, 'completions/mean_length': 6626.7578125, 'completions/min_length': 940.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6549.92919921875, 'completions/min_terminated_length': 940.0, 'completions/max_terminated_length': 14283.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020494937896728516, 'sampling/sampling_logp_difference/max': 8.189985275268555, 'sampling/importance_sampling_ratio/min': 0.0002774179738480598, 'sampling/importance_sampling_ratio/mean': 1.0000152587890625, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9632527679204941, 'clip_ratio/low_mean': 3.568914848983695e-05, 'clip_ratio/low_min': 3.652834493550472e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.568914848983695e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 552/1024 [1:49:57<21:43:15, 165.67s/it][AINFO 12-02 19:25:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:25:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:25:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:25:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 553/1024 [1:52:44<21:44:21, 166.16s/it][A
+                                                        [A{'loss': 0.0196, 'grad_norm': 0.0019802958704531193, 'learning_rate': 1e-05, 'num_tokens': 466911965.0, 'completions/mean_length': 6685.484375, 'completions/min_length': 349.0, 'completions/max_length': 16169.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6685.484375, 'completions/min_terminated_length': 349.0, 'completions/max_terminated_length': 16169.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2409384697675705, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02000512182712555, 'sampling/sampling_logp_difference/max': 6.193117141723633, 'sampling/importance_sampling_ratio/min': 0.0020434472244232893, 'sampling/importance_sampling_ratio/mean': 0.999974250793457, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9092860966920853, 'clip_ratio/low_mean': 2.4154636378170835e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4527116693207063e-06, 'clip_ratio/high_max': 9.810846677282825e-06, 'clip_ratio/region_mean': 2.660734804749154e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 553/1024 [1:52:44<21:44:21, 166.16s/it][AINFO 12-02 19:28:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:28:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:28:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:28:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 554/1024 [1:55:18<21:13:04, 162.52s/it][A
+                                                        [A{'loss': 0.0077, 'grad_norm': 0.002490658313035965, 'learning_rate': 1e-05, 'num_tokens': 467844820.0, 'completions/mean_length': 7105.1171875, 'completions/min_length': 71.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7032.05517578125, 'completions/min_terminated_length': 71.0, 'completions/max_terminated_length': 14992.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.17123225331306458, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020726388320326805, 'sampling/sampling_logp_difference/max': 14.152327537536621, 'sampling/importance_sampling_ratio/min': 7.140394586713228e-07, 'sampling/importance_sampling_ratio/mean': 0.9999809265136719, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.046683594584465, 'clip_ratio/low_mean': 2.4120176362885104e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.10993185496045e-07, 'clip_ratio/high_max': 3.24397274198418e-06, 'clip_ratio/region_mean': 2.493116954838115e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 554/1024 [1:55:18<21:13:04, 162.52s/it][AINFO 12-02 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 555/1024 [1:57:53<20:53:24, 160.35s/it][A
+                                                        [A{'loss': 0.0331, 'grad_norm': 0.002886313945055008, 'learning_rate': 1e-05, 'num_tokens': 468732451.0, 'completions/mean_length': 6806.5546875, 'completions/min_length': 605.0, 'completions/max_length': 15969.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6806.5546875, 'completions/min_terminated_length': 605.0, 'completions/max_terminated_length': 15969.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.23250603675842285, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019308820366859436, 'sampling/sampling_logp_difference/max': 12.668462753295898, 'sampling/importance_sampling_ratio/min': 3.148883251924417e-06, 'sampling/importance_sampling_ratio/mean': 0.9999695420265198, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9514358267188072, 'clip_ratio/low_mean': 3.0303147582344536e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.0303147582344536e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 555/1024 [1:57:53<20:53:24, 160.35s/it][AINFO 12-02 19:33:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:33:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:33:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:33:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 556/1024 [2:00:40<21:05:41, 162.27s/it][A
+                                                        [A{'loss': 0.0622, 'grad_norm': 0.002731110667809844, 'learning_rate': 1e-05, 'num_tokens': 469551145.0, 'completions/mean_length': 6238.546875, 'completions/min_length': 12.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5995.05615234375, 'completions/min_terminated_length': 12.0, 'completions/max_terminated_length': 16075.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3237774670124054, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01883235014975071, 'sampling/sampling_logp_difference/max': 7.624849796295166, 'sampling/importance_sampling_ratio/min': 0.000488168589072302, 'sampling/importance_sampling_ratio/mean': 0.9999334812164307, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9408878460526466, 'clip_ratio/low_mean': 1.485187078742456e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.485187078742456e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 556/1024 [2:00:40<21:05:41, 162.27s/it][AINFO 12-02 19:36:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:36:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:36:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:36:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 557/1024 [2:03:38<21:38:34, 166.84s/it][A
+                                                        [A{'loss': 0.004, 'grad_norm': 0.0022831051610410213, 'learning_rate': 1e-05, 'num_tokens': 470510305.0, 'completions/mean_length': 7354.5, 'completions/min_length': 119.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7283.4013671875, 'completions/min_terminated_length': 119.0, 'completions/max_terminated_length': 16342.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.28247418999671936, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020626772195100784, 'sampling/sampling_logp_difference/max': 8.113459587097168, 'sampling/importance_sampling_ratio/min': 0.00029948100564070046, 'sampling/importance_sampling_ratio/mean': 0.999933123588562, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9548593312501907, 'clip_ratio/low_mean': 2.5422534008612274e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.869274097378366e-07, 'clip_ratio/high_max': 3.5477096389513463e-06, 'clip_ratio/region_mean': 2.630946141835011e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 557/1024 [2:03:38<21:38:34, 166.84s/it][AINFO 12-02 19:39:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:39:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:39:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:39:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 558/1024 [2:06:41<22:14:49, 171.86s/it][A
+                                                        [A{'loss': 0.0944, 'grad_norm': 0.0015396618982777, 'learning_rate': 1e-05, 'num_tokens': 471486799.0, 'completions/mean_length': 7481.421875, 'completions/min_length': 358.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7119.5283203125, 'completions/min_terminated_length': 358.0, 'completions/max_terminated_length': 16222.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.26538968086242676, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019920824095606804, 'sampling/sampling_logp_difference/max': 7.78102970123291, 'sampling/importance_sampling_ratio/min': 0.0004175819631200284, 'sampling/importance_sampling_ratio/mean': 0.9999397993087769, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9302244186401367, 'clip_ratio/low_mean': 4.646405352559668e-05, 'clip_ratio/low_min': 9.308073458669242e-06, 'clip_ratio/high_mean': 2.6196769908892747e-06, 'clip_ratio/high_max': 1.0478707963557099e-05, 'clip_ratio/region_mean': 4.908373023226886e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 558/1024 [2:06:41<22:14:49, 171.86s/it][AINFO 12-02 19:42:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:42:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:42:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:42:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 559/1024 [2:09:25<21:54:01, 169.55s/it][A
+                                                        [A{'loss': 0.0653, 'grad_norm': 0.002122553065419197, 'learning_rate': 1e-05, 'num_tokens': 472443991.0, 'completions/mean_length': 7333.9375, 'completions/min_length': 575.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7042.0, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 14634.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.23356688022613525, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020558707416057587, 'sampling/sampling_logp_difference/max': 9.073382377624512, 'sampling/importance_sampling_ratio/min': 0.00011467799777165055, 'sampling/importance_sampling_ratio/mean': 1.0000252723693848, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0290198475122452, 'clip_ratio/low_mean': 3.139938735330361e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.185782986747654e-06, 'clip_ratio/high_max': 1.2743131946990616e-05, 'clip_ratio/region_mean': 3.458517039689468e-05, 'epoch': 0.51}
+
+ 55%|█████▍    | 559/1024 [2:09:25<21:54:01, 169.55s/it][AINFO 12-02 19:45:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:45:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:45:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:45:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 560/1024 [2:12:39<22:46:47, 176.74s/it][A
+                                                        [A{'loss': 0.1309, 'grad_norm': 0.0017197602428495884, 'learning_rate': 1e-05, 'num_tokens': 473346577.0, 'completions/mean_length': 6908.953125, 'completions/min_length': 463.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6681.55224609375, 'completions/min_terminated_length': 463.0, 'completions/max_terminated_length': 16248.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019906114786863327, 'sampling/sampling_logp_difference/max': 8.68148136138916, 'sampling/importance_sampling_ratio/min': 0.00016969948774203658, 'sampling/importance_sampling_ratio/mean': 0.9999131560325623, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9942271336913109, 'clip_ratio/low_mean': 4.716298451512557e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.142013544125803e-06, 'clip_ratio/high_max': 2.856805417650321e-05, 'clip_ratio/region_mean': 5.430499885505924e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 560/1024 [2:12:39<22:46:47, 176.74s/it][AINFO 12-02 19:48:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:48:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:48:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:48:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 561/1024 [2:15:59<23:37:00, 183.63s/it][A
+                                                        [A{'loss': 0.0434, 'grad_norm': 0.001618197187781334, 'learning_rate': 1e-05, 'num_tokens': 474330663.0, 'completions/mean_length': 7543.046875, 'completions/min_length': 894.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7183.658203125, 'completions/min_terminated_length': 894.0, 'completions/max_terminated_length': 16315.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.28353503346443176, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019991599023342133, 'sampling/sampling_logp_difference/max': 15.356815338134766, 'sampling/importance_sampling_ratio/min': 2.1410157557966158e-07, 'sampling/importance_sampling_ratio/mean': 0.9999313950538635, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.973315916955471, 'clip_ratio/low_mean': 3.3024165190909116e-05, 'clip_ratio/low_min': 2.9529187486332376e-06, 'clip_ratio/high_mean': 7.2725478048596415e-06, 'clip_ratio/high_max': 2.4387230496358825e-05, 'clip_ratio/region_mean': 4.029671254102141e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 561/1024 [2:15:59<23:37:00, 183.63s/it][AINFO 12-02 19:51:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:51:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:51:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:51:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 562/1024 [2:19:09<23:50:13, 185.74s/it][A
+                                                        [A{'loss': 0.0188, 'grad_norm': 0.0025291196070611477, 'learning_rate': 1e-05, 'num_tokens': 475262071.0, 'completions/mean_length': 7136.375, 'completions/min_length': 829.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6838.064453125, 'completions/min_terminated_length': 829.0, 'completions/max_terminated_length': 15773.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018726464360952377, 'sampling/sampling_logp_difference/max': 9.749961853027344, 'sampling/importance_sampling_ratio/min': 5.8296889619668946e-05, 'sampling/importance_sampling_ratio/mean': 0.9999455213546753, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8573452606797218, 'clip_ratio/low_mean': 5.2758662491214636e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.546317370568431e-06, 'clip_ratio/high_max': 1.8185269482273725e-05, 'clip_ratio/region_mean': 5.7304980941808026e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 562/1024 [2:19:09<23:50:13, 185.74s/it][AINFO 12-02 19:54:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:54:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:54:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:54:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 563/1024 [2:21:55<23:00:14, 179.64s/it][A
+                                                        [A{'loss': 0.1266, 'grad_norm': 0.0018195402808487415, 'learning_rate': 1e-05, 'num_tokens': 476119385.0, 'completions/mean_length': 6538.765625, 'completions/min_length': 374.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6138.552734375, 'completions/min_terminated_length': 374.0, 'completions/max_terminated_length': 16283.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3674348294734955, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018957480788230896, 'sampling/sampling_logp_difference/max': 5.136995792388916, 'sampling/importance_sampling_ratio/min': 0.005875314120203257, 'sampling/importance_sampling_ratio/mean': 0.9999343156814575, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8312613591551781, 'clip_ratio/low_mean': 4.3801222432193754e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.808350070106826e-06, 'clip_ratio/high_max': 1.9233400280427304e-05, 'clip_ratio/region_mean': 4.860957244545716e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 563/1024 [2:21:55<23:00:14, 179.64s/it][AINFO 12-02 19:57:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:57:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:57:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:57:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▌    | 564/1024 [2:24:53<22:54:07, 179.23s/it][A
+                                                        [A{'loss': 0.077, 'grad_norm': 0.001623075339011848, 'learning_rate': 1e-05, 'num_tokens': 476995139.0, 'completions/mean_length': 6692.078125, 'completions/min_length': 938.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 5870.72900390625, 'completions/min_terminated_length': 938.0, 'completions/max_terminated_length': 16000.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019327864050865173, 'sampling/sampling_logp_difference/max': 8.029895782470703, 'sampling/importance_sampling_ratio/min': 0.0003255821648053825, 'sampling/importance_sampling_ratio/mean': 0.9999014139175415, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.943247564136982, 'clip_ratio/low_mean': 2.9377598366409075e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.574774837034056e-06, 'clip_ratio/high_max': 1.4299099348136224e-05, 'clip_ratio/region_mean': 3.295237320344313e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 564/1024 [2:24:53<22:54:07, 179.23s/it][AINFO 12-02 20:00:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:00:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:00:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:00:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▌    | 565/1024 [2:27:59<23:07:47, 181.41s/it][A
+                                                        [A{'loss': 0.0542, 'grad_norm': 0.0034032040275633335, 'learning_rate': 1e-05, 'num_tokens': 477926583.0, 'completions/mean_length': 7118.40625, 'completions/min_length': 118.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6896.0322265625, 'completions/min_terminated_length': 118.0, 'completions/max_terminated_length': 15957.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.30115145444869995, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021631836891174316, 'sampling/sampling_logp_difference/max': 7.887211322784424, 'sampling/importance_sampling_ratio/min': 0.00037551531568169594, 'sampling/importance_sampling_ratio/mean': 1.0000249147415161, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.051003873348236, 'clip_ratio/low_mean': 5.479312403622316e-05, 'clip_ratio/low_min': 8.624037718618638e-06, 'clip_ratio/high_mean': 6.369621701196593e-07, 'clip_ratio/high_max': 2.547848680478637e-06, 'clip_ratio/region_mean': 5.543008592212573e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 565/1024 [2:27:59<23:07:47, 181.41s/it][AINFO 12-02 20:03:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:03:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:03:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:03:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▌    | 566/1024 [2:30:51<22:42:04, 178.44s/it][A
+                                                        [A{'loss': 0.0089, 'grad_norm': 0.0014035169733688235, 'learning_rate': 1e-05, 'num_tokens': 478914724.0, 'completions/mean_length': 7555.8515625, 'completions/min_length': 446.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7415.72265625, 'completions/min_terminated_length': 446.0, 'completions/max_terminated_length': 15673.0, 'rewards/accuracy_reward/mean': 0.1875, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.1875, 'reward_std': 0.19673939049243927, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020011281594634056, 'sampling/sampling_logp_difference/max': 6.373074054718018, 'sampling/importance_sampling_ratio/min': 0.0017069041496142745, 'sampling/importance_sampling_ratio/mean': 0.9999145865440369, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9771487265825272, 'clip_ratio/low_mean': 1.506989860899921e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.558328883940703e-07, 'clip_ratio/high_max': 3.823331553576281e-06, 'clip_ratio/region_mean': 1.602573161108012e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 566/1024 [2:30:51<22:42:04, 178.44s/it][AINFO 12-02 20:06:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:06:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:06:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:06:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▌    | 567/1024 [2:33:48<22:35:41, 177.99s/it][A
+                                                        [A{'loss': 0.0387, 'grad_norm': 0.0012457151897251606, 'learning_rate': 1e-05, 'num_tokens': 479766874.0, 'completions/mean_length': 6505.671875, 'completions/min_length': 638.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6019.85205078125, 'completions/min_terminated_length': 638.0, 'completions/max_terminated_length': 15915.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.16781240701675415, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01945749670267105, 'sampling/sampling_logp_difference/max': 16.764495849609375, 'sampling/importance_sampling_ratio/min': 5.239284206481898e-08, 'sampling/importance_sampling_ratio/mean': 0.9999585151672363, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9913810566067696, 'clip_ratio/low_mean': 2.9273888458192232e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0894199224130716e-06, 'clip_ratio/high_max': 4.262138645572122e-06, 'clip_ratio/region_mean': 3.1363308380605304e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 567/1024 [2:33:48<22:35:41, 177.99s/it][AINFO 12-02 20:09:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:09:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:09:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:09:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▌    | 568/1024 [2:36:18<21:28:59, 169.60s/it][A
+                                                        [A{'loss': 0.0336, 'grad_norm': 0.002912909025326371, 'learning_rate': 1e-05, 'num_tokens': 480644782.0, 'completions/mean_length': 6709.96875, 'completions/min_length': 7.0, 'completions/max_length': 15589.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6709.96875, 'completions/min_terminated_length': 7.0, 'completions/max_terminated_length': 15589.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2041109800338745, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020628605037927628, 'sampling/sampling_logp_difference/max': 9.183414459228516, 'sampling/importance_sampling_ratio/min': 0.00010272916551912203, 'sampling/importance_sampling_ratio/mean': 1.0000216960906982, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.053658738732338, 'clip_ratio/low_mean': 3.3968740126510966e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1049430668208515e-06, 'clip_ratio/high_max': 4.419772267283406e-06, 'clip_ratio/region_mean': 3.507368319333182e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 568/1024 [2:36:18<21:28:59, 169.60s/it][AINFO 12-02 20:11:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:11:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:11:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:11:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 569/1024 [2:39:00<21:08:22, 167.26s/it][A
+                                                        [A{'loss': 0.0293, 'grad_norm': 0.0028935675509274006, 'learning_rate': 1e-05, 'num_tokens': 481525875.0, 'completions/mean_length': 6712.8515625, 'completions/min_length': 18.0, 'completions/max_length': 15677.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6712.8515625, 'completions/min_terminated_length': 18.0, 'completions/max_terminated_length': 15677.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.33797892928123474, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0201251357793808, 'sampling/sampling_logp_difference/max': 8.060677528381348, 'sampling/importance_sampling_ratio/min': 0.0003157128521706909, 'sampling/importance_sampling_ratio/mean': 0.9999656677246094, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9288468211889267, 'clip_ratio/low_mean': 7.926051148388069e-05, 'clip_ratio/low_min': 9.047379990079207e-06, 'clip_ratio/high_mean': 5.148336185811786e-06, 'clip_ratio/high_max': 1.5635781892342493e-05, 'clip_ratio/region_mean': 8.440884812443983e-05, 'epoch': 0.52}
+
+ 56%|█████▌    | 569/1024 [2:39:00<21:08:22, 167.26s/it][AINFO 12-02 20:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:14:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 570/1024 [2:41:59<21:33:22, 170.93s/it][A
+                                                        [A{'loss': 0.1068, 'grad_norm': 0.0014447550056502223, 'learning_rate': 1e-05, 'num_tokens': 482498539.0, 'completions/mean_length': 7452.125, 'completions/min_length': 215.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7164.0, 'completions/min_terminated_length': 215.0, 'completions/max_terminated_length': 16247.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.3145885467529297, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018142810091376305, 'sampling/sampling_logp_difference/max': 7.104595184326172, 'sampling/importance_sampling_ratio/min': 0.0008213221444748342, 'sampling/importance_sampling_ratio/mean': 0.9999352097511292, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8201636075973511, 'clip_ratio/low_mean': 4.98413718332813e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7517803005139285e-06, 'clip_ratio/high_max': 1.1007121202055714e-05, 'clip_ratio/region_mean': 5.2593152645386e-05, 'epoch': 0.52}
+
+ 56%|█████▌    | 570/1024 [2:41:59<21:33:22, 170.93s/it][AINFO 12-02 20:17:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:17:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:17:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:17:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 571/1024 [2:44:47<21:23:47, 170.04s/it][A
+                                                        [A{'loss': -0.0018, 'grad_norm': 0.0013929647393524647, 'learning_rate': 1e-05, 'num_tokens': 483286590.0, 'completions/mean_length': 5997.6484375, 'completions/min_length': 15.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5915.8662109375, 'completions/min_terminated_length': 15.0, 'completions/max_terminated_length': 16242.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.2959064245223999, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019294174388051033, 'sampling/sampling_logp_difference/max': 9.587113380432129, 'sampling/importance_sampling_ratio/min': 6.860717985546216e-05, 'sampling/importance_sampling_ratio/mean': 1.0000566244125366, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9595593363046646, 'clip_ratio/low_mean': 2.5241818775612046e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.723445716896094e-07, 'clip_ratio/high_max': 3.4893782867584378e-06, 'clip_ratio/region_mean': 2.6114163347301655e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 571/1024 [2:44:47<21:23:47, 170.04s/it][AINFO 12-02 20:20:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:20:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:20:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:20:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 572/1024 [2:47:51<21:52:33, 174.23s/it][A
+                                                        [A{'loss': 0.0713, 'grad_norm': 0.0020693838596343994, 'learning_rate': 1e-05, 'num_tokens': 484164003.0, 'completions/mean_length': 6706.4140625, 'completions/min_length': 233.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6474.15234375, 'completions/min_terminated_length': 233.0, 'completions/max_terminated_length': 15962.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.30744946002960205, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01968679018318653, 'sampling/sampling_logp_difference/max': 4.505383491516113, 'sampling/importance_sampling_ratio/min': 0.011049352586269379, 'sampling/importance_sampling_ratio/mean': 0.9999852180480957, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9320398196578026, 'clip_ratio/low_mean': 3.2705364901630674e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1854543180998007e-06, 'clip_ratio/high_max': 1.2741817272399203e-05, 'clip_ratio/region_mean': 3.589081939026073e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 572/1024 [2:47:51<21:52:33, 174.23s/it][AINFO 12-02 20:23:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:23:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:23:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:23:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 573/1024 [2:50:30<21:15:20, 169.67s/it][A
+                                                        [A{'loss': 0.0528, 'grad_norm': 0.004261080641299486, 'learning_rate': 1e-05, 'num_tokens': 484864799.0, 'completions/mean_length': 5317.96875, 'completions/min_length': 344.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5230.83447265625, 'completions/min_terminated_length': 344.0, 'completions/max_terminated_length': 15636.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.20753079652786255, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01876065693795681, 'sampling/sampling_logp_difference/max': 8.853672981262207, 'sampling/importance_sampling_ratio/min': 0.00014285604993347079, 'sampling/importance_sampling_ratio/mean': 0.9999099969863892, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.891069769859314, 'clip_ratio/low_mean': 2.067615122314237e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.457511181499285e-06, 'clip_ratio/high_max': 1.783004472599714e-05, 'clip_ratio/region_mean': 2.5133662290954817e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 573/1024 [2:50:30<21:15:20, 169.67s/it][AINFO 12-02 20:26:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:26:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:26:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:26:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 574/1024 [2:53:35<21:45:39, 174.09s/it][A
+                                                        [A{'loss': 0.0396, 'grad_norm': 0.0013973438180983067, 'learning_rate': 1e-05, 'num_tokens': 485779676.0, 'completions/mean_length': 6978.7890625, 'completions/min_length': 710.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6596.46337890625, 'completions/min_terminated_length': 710.0, 'completions/max_terminated_length': 16336.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2675113081932068, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019345812499523163, 'sampling/sampling_logp_difference/max': 8.306513786315918, 'sampling/importance_sampling_ratio/min': 0.00024690330610610545, 'sampling/importance_sampling_ratio/mean': 0.9999111890792847, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9322286397218704, 'clip_ratio/low_mean': 4.1548010585756856e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.738632818160113e-06, 'clip_ratio/high_max': 6.954531272640452e-06, 'clip_ratio/region_mean': 4.328664340391697e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 574/1024 [2:53:35<21:45:39, 174.09s/it][AINFO 12-02 20:29:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:29:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:29:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:29:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 575/1024 [2:56:22<21:26:45, 171.95s/it][A
+                                                        [A{'loss': 0.1013, 'grad_norm': 0.0015273626195266843, 'learning_rate': 1e-05, 'num_tokens': 486574779.0, 'completions/mean_length': 6046.4921875, 'completions/min_length': 997.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5965.09423828125, 'completions/min_terminated_length': 997.0, 'completions/max_terminated_length': 16178.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.3345639705657959, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01977401226758957, 'sampling/sampling_logp_difference/max': 7.7330522537231445, 'sampling/importance_sampling_ratio/min': 0.00043810487841255963, 'sampling/importance_sampling_ratio/mean': 0.9998648166656494, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0245087146759033, 'clip_ratio/low_mean': 3.8480168882415455e-05, 'clip_ratio/low_min': 8.625057944300352e-06, 'clip_ratio/high_mean': 3.506077746351366e-06, 'clip_ratio/high_max': 1.4024310985405464e-05, 'clip_ratio/region_mean': 4.198624606033263e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 575/1024 [2:56:22<21:26:45, 171.95s/it][AINFO 12-02 20:31:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:31:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:31:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:31:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▋    | 576/1024 [2:59:10<21:16:02, 170.90s/it][A
+                                                        [A{'loss': 0.0426, 'grad_norm': 0.003076995024457574, 'learning_rate': 1e-05, 'num_tokens': 487366590.0, 'completions/mean_length': 5987.0859375, 'completions/min_length': 848.0, 'completions/max_length': 16086.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5987.0859375, 'completions/min_terminated_length': 848.0, 'completions/max_terminated_length': 16086.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.24830511212348938, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018766682595014572, 'sampling/sampling_logp_difference/max': 7.659616470336914, 'sampling/importance_sampling_ratio/min': 0.0004714882234111428, 'sampling/importance_sampling_ratio/mean': 1.0000369548797607, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9373713582754135, 'clip_ratio/low_mean': 2.9637111538249883e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7530613806447946e-06, 'clip_ratio/high_max': 1.1012245522579178e-05, 'clip_ratio/region_mean': 3.239017382838938e-05, 'epoch': 0.53}
+
+ 56%|█████▋    | 576/1024 [2:59:10<21:16:02, 170.90s/it][AINFO 12-02 20:34:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:34:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:34:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:34:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 56%|█████▋    | 577/1024 [3:02:13<21:39:27, 174.42s/it][A
+                                                        [A{'loss': 0.0769, 'grad_norm': 0.003187539055943489, 'learning_rate': 1e-05, 'num_tokens': 488319172.0, 'completions/mean_length': 7251.984375, 'completions/min_length': 483.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6957.40283203125, 'completions/min_terminated_length': 483.0, 'completions/max_terminated_length': 14430.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2398776262998581, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01952427066862583, 'sampling/sampling_logp_difference/max': 10.74979019165039, 'sampling/importance_sampling_ratio/min': 2.144990867236629e-05, 'sampling/importance_sampling_ratio/mean': 0.9999208450317383, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9717797562479973, 'clip_ratio/low_mean': 2.589502707905922e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.634733505146869e-06, 'clip_ratio/high_max': 1.0538934020587476e-05, 'clip_ratio/region_mean': 2.852976138001395e-05, 'epoch': 0.53}
+
+ 56%|█████▋    | 577/1024 [3:02:13<21:39:27, 174.42s/it][AINFO 12-02 20:37:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:37:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:37:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:37:49 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 20:39:38,146 - math_verify.grader - WARNING - Timeout during comparison
+[OpenTinker] 2025-12-02 20:39:43,220 - math_verify.grader - WARNING - Timeout during comparison
+
+ 56%|█████▋    | 578/1024 [3:05:08<21:39:14, 174.79s/it][A
+                                                        [A{'loss': 0.0425, 'grad_norm': 0.0015998799353837967, 'learning_rate': 1e-05, 'num_tokens': 489170217.0, 'completions/mean_length': 6528.8515625, 'completions/min_length': 571.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6451.251953125, 'completions/min_terminated_length': 571.0, 'completions/max_terminated_length': 15821.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.65625, 'reward_std': 0.304571270942688, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017877915874123573, 'sampling/sampling_logp_difference/max': 8.874923706054688, 'sampling/importance_sampling_ratio/min': 0.0001398523017996922, 'sampling/importance_sampling_ratio/mean': 1.0000030994415283, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8000363186001778, 'clip_ratio/low_mean': 2.4101982717184e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.469707038078923e-06, 'clip_ratio/high_max': 3.787882815231569e-05, 'clip_ratio/region_mean': 3.3571689641576086e-05, 'epoch': 0.53}
+
+ 56%|█████▋    | 578/1024 [3:05:08<21:39:14, 174.79s/it][AINFO 12-02 20:40:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:40:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:40:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:40:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 579/1024 [3:07:29<20:21:19, 164.67s/it][A
+                                                        [A{'loss': 0.0597, 'grad_norm': 0.002314757788553834, 'learning_rate': 1e-05, 'num_tokens': 489917906.0, 'completions/mean_length': 5685.0703125, 'completions/min_length': 9.0, 'completions/max_length': 15651.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5685.0703125, 'completions/min_terminated_length': 9.0, 'completions/max_terminated_length': 15651.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020619021728634834, 'sampling/sampling_logp_difference/max': 12.308268547058105, 'sampling/importance_sampling_ratio/min': 4.514263309829403e-06, 'sampling/importance_sampling_ratio/mean': 0.9999635815620422, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9907611533999443, 'clip_ratio/low_mean': 4.5643309931620024e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.001541234785691e-06, 'clip_ratio/high_max': 2.4189836949517485e-05, 'clip_ratio/region_mean': 5.2644851166405715e-05, 'epoch': 0.53}
+
+ 57%|█████▋    | 579/1024 [3:07:29<20:21:19, 164.67s/it][AINFO 12-02 20:43:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:43:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:43:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:43:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 580/1024 [3:10:37<21:10:32, 171.70s/it][A
+                                                        [A{'loss': 0.0167, 'grad_norm': 0.0007821820909157395, 'learning_rate': 1e-05, 'num_tokens': 491005298.0, 'completions/mean_length': 8329.9375, 'completions/min_length': 494.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8070.12890625, 'completions/min_terminated_length': 494.0, 'completions/max_terminated_length': 15327.0, 'rewards/accuracy_reward/mean': 0.15625, 'rewards/accuracy_reward/std': 0.3645188808441162, 'reward': 0.15625, 'reward_std': 0.1462521106004715, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020741768181324005, 'sampling/sampling_logp_difference/max': 5.374422073364258, 'sampling/importance_sampling_ratio/min': 0.004633596166968346, 'sampling/importance_sampling_ratio/mean': 1.000043511390686, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9830568954348564, 'clip_ratio/low_mean': 2.0602713902917458e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0356598068028688e-06, 'clip_ratio/high_max': 4.142639227211475e-06, 'clip_ratio/region_mean': 2.1638373709720327e-05, 'epoch': 0.53}
+
+ 57%|█████▋    | 580/1024 [3:10:37<21:10:32, 171.70s/it][AINFO 12-02 20:46:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:46:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:46:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:46:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 581/1024 [3:13:25<20:59:08, 170.54s/it][A
+                                                        [A{'loss': 0.0806, 'grad_norm': 0.0013309167698025703, 'learning_rate': 1e-05, 'num_tokens': 491930405.0, 'completions/mean_length': 7080.7109375, 'completions/min_length': 843.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6460.4921875, 'completions/min_terminated_length': 843.0, 'completions/max_terminated_length': 16380.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2937847673892975, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019711364060640335, 'sampling/sampling_logp_difference/max': 15.035537719726562, 'sampling/importance_sampling_ratio/min': 2.952221507257491e-07, 'sampling/importance_sampling_ratio/mean': 0.9999532699584961, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9614995270967484, 'clip_ratio/low_mean': 3.652423993116827e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.4366088824717735e-06, 'clip_ratio/high_max': 2.1265087298161234e-05, 'clip_ratio/region_mean': 4.396084796098876e-05, 'epoch': 0.53}
+
+ 57%|█████▋    | 581/1024 [3:13:25<20:59:08, 170.54s/it][AINFO 12-02 20:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:49:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 582/1024 [3:15:47<19:53:09, 161.97s/it][A
+                                                        [A{'loss': 0.0047, 'grad_norm': 0.004105919040739536, 'learning_rate': 1e-05, 'num_tokens': 492580268.0, 'completions/mean_length': 4908.3046875, 'completions/min_length': 10.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4817.94482421875, 'completions/min_terminated_length': 10.0, 'completions/max_terminated_length': 15096.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.30904704332351685, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01891482062637806, 'sampling/sampling_logp_difference/max': 6.6072564125061035, 'sampling/importance_sampling_ratio/min': 0.0013505324022844434, 'sampling/importance_sampling_ratio/mean': 1.0000088214874268, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9987996220588684, 'clip_ratio/low_mean': 3.2411358688477776e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4057488872131216e-06, 'clip_ratio/high_max': 9.622995548852487e-06, 'clip_ratio/region_mean': 3.48171075756909e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 582/1024 [3:15:47<19:53:09, 161.97s/it][AINFO 12-02 20:51:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:51:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:51:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:51:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 583/1024 [3:18:09<19:06:20, 155.96s/it][A
+                                                        [A{'loss': 0.0122, 'grad_norm': 0.0023835445754230022, 'learning_rate': 1e-05, 'num_tokens': 493412089.0, 'completions/mean_length': 6354.6015625, 'completions/min_length': 138.0, 'completions/max_length': 15204.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6354.6015625, 'completions/min_terminated_length': 138.0, 'completions/max_terminated_length': 15204.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021383190527558327, 'sampling/sampling_logp_difference/max': 8.355856895446777, 'sampling/importance_sampling_ratio/min': 0.0002350160211790353, 'sampling/importance_sampling_ratio/mean': 1.0000243186950684, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0391495525836945, 'clip_ratio/low_mean': 4.36340567375737e-05, 'clip_ratio/low_min': 3.839302280539414e-06, 'clip_ratio/high_mean': 4.443851594260195e-06, 'clip_ratio/high_max': 1.777540637704078e-05, 'clip_ratio/region_mean': 4.8077908331833896e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 583/1024 [3:18:09<19:06:20, 155.96s/it][AINFO 12-02 20:53:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:53:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:53:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:53:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 584/1024 [3:20:50<19:14:34, 157.44s/it][A
+                                                        [A{'loss': 0.0704, 'grad_norm': 0.003752552904188633, 'learning_rate': 1e-05, 'num_tokens': 494188990.0, 'completions/mean_length': 5936.9765625, 'completions/min_length': 566.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5686.248046875, 'completions/min_terminated_length': 566.0, 'completions/max_terminated_length': 14365.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018900560215115547, 'sampling/sampling_logp_difference/max': 6.530724048614502, 'sampling/importance_sampling_ratio/min': 0.0014579498674720526, 'sampling/importance_sampling_ratio/mean': 0.9998899102210999, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9083528146147728, 'clip_ratio/low_mean': 4.246656294526474e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5247621831804281e-06, 'clip_ratio/high_max': 6.0990487327217124e-06, 'clip_ratio/region_mean': 4.399132512844517e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 584/1024 [3:20:50<19:14:34, 157.44s/it][AINFO 12-02 20:56:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:56:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:56:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:56:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 585/1024 [3:23:30<19:16:26, 158.06s/it][A
+                                                        [A{'loss': 0.0664, 'grad_norm': 0.002407103544101119, 'learning_rate': 1e-05, 'num_tokens': 495002680.0, 'completions/mean_length': 6188.640625, 'completions/min_length': 528.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6026.81005859375, 'completions/min_terminated_length': 528.0, 'completions/max_terminated_length': 14274.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.34876543283462524, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02121657505631447, 'sampling/sampling_logp_difference/max': 9.499998092651367, 'sampling/importance_sampling_ratio/min': 7.485197420464829e-05, 'sampling/importance_sampling_ratio/mean': 0.9999685287475586, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1161335483193398, 'clip_ratio/low_mean': 5.7760178151511354e-05, 'clip_ratio/low_min': 1.2930649063491728e-05, 'clip_ratio/high_mean': 1.4652100617240649e-06, 'clip_ratio/high_max': 5.8608402468962595e-06, 'clip_ratio/region_mean': 5.922538775848807e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 585/1024 [3:23:30<19:16:26, 158.06s/it][AINFO 12-02 20:59:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:59:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:59:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:59:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 586/1024 [3:26:16<19:31:55, 160.54s/it][A
+                                                        [A{'loss': 0.057, 'grad_norm': 0.002197301248088479, 'learning_rate': 1e-05, 'num_tokens': 495970313.0, 'completions/mean_length': 7417.3828125, 'completions/min_length': 785.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7275.05615234375, 'completions/min_terminated_length': 785.0, 'completions/max_terminated_length': 15901.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2725111246109009, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0190783329308033, 'sampling/sampling_logp_difference/max': 6.6439409255981445, 'sampling/importance_sampling_ratio/min': 0.0013018865138292313, 'sampling/importance_sampling_ratio/mean': 0.9999420046806335, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9426288679242134, 'clip_ratio/low_mean': 3.367023407463421e-05, 'clip_ratio/low_min': 5.107043762109242e-06, 'clip_ratio/high_mean': 5.107369474899315e-06, 'clip_ratio/high_max': 2.042947789959726e-05, 'clip_ratio/region_mean': 3.8777602981099335e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 586/1024 [3:26:16<19:31:55, 160.54s/it][AINFO 12-02 21:01:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:01:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:01:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:01:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 587/1024 [3:28:55<19:26:31, 160.16s/it][A
+                                                        [A{'loss': 0.0048, 'grad_norm': 0.0017400799551978707, 'learning_rate': 1e-05, 'num_tokens': 496840263.0, 'completions/mean_length': 6652.796875, 'completions/min_length': 517.0, 'completions/max_length': 16060.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6652.796875, 'completions/min_terminated_length': 517.0, 'completions/max_terminated_length': 16060.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.23068873584270477, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02080768160521984, 'sampling/sampling_logp_difference/max': 7.967526912689209, 'sampling/importance_sampling_ratio/min': 0.0003465349436737597, 'sampling/importance_sampling_ratio/mean': 1.0000059604644775, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0600578784942627, 'clip_ratio/low_mean': 1.7456467162446643e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2656898270033707e-06, 'clip_ratio/high_max': 1.3062759308013483e-05, 'clip_ratio/region_mean': 2.0722156989450013e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 587/1024 [3:28:55<19:26:31, 160.16s/it][AINFO 12-02 21:04:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:04:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:04:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:04:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 588/1024 [3:32:04<20:25:42, 168.67s/it][A
+                                                        [A{'loss': 0.071, 'grad_norm': 0.0014333624858409166, 'learning_rate': 1e-05, 'num_tokens': 497891216.0, 'completions/mean_length': 8063.6328125, 'completions/min_length': 470.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7582.2890625, 'completions/min_terminated_length': 470.0, 'completions/max_terminated_length': 16355.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3182408809661865, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01954628899693489, 'sampling/sampling_logp_difference/max': 7.74574613571167, 'sampling/importance_sampling_ratio/min': 0.00043257875950075686, 'sampling/importance_sampling_ratio/mean': 0.9999160766601562, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9112813174724579, 'clip_ratio/low_mean': 3.980786505053402e-05, 'clip_ratio/low_min': 4.07418292525108e-06, 'clip_ratio/high_mean': 3.5600247656475403e-06, 'clip_ratio/high_max': 1.1523770353960572e-05, 'clip_ratio/region_mean': 4.336788970249472e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 588/1024 [3:32:04<20:25:42, 168.67s/it][AINFO 12-02 21:07:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:07:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:07:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:07:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 589/1024 [3:34:51<20:19:10, 168.16s/it][A
+                                                        [A{'loss': 0.0292, 'grad_norm': 0.002002251800149679, 'learning_rate': 1e-05, 'num_tokens': 498774852.0, 'completions/mean_length': 6720.59375, 'completions/min_length': 171.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6567.20654296875, 'completions/min_terminated_length': 171.0, 'completions/max_terminated_length': 15275.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.32719242572784424, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020341092720627785, 'sampling/sampling_logp_difference/max': 3.6814346313476562, 'sampling/importance_sampling_ratio/min': 0.025186816230416298, 'sampling/importance_sampling_ratio/mean': 1.0000476837158203, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.048555426299572, 'clip_ratio/low_mean': 4.7517322173007415e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.287134515834623e-06, 'clip_ratio/high_max': 1.7148538063338492e-05, 'clip_ratio/region_mean': 5.180445691621571e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 589/1024 [3:34:51<20:19:10, 168.16s/it][AINFO 12-02 21:10:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:10:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:10:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:10:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 590/1024 [3:37:46<20:30:48, 170.16s/it][A
+                                                        [A{'loss': 0.079, 'grad_norm': 0.0021725764963775873, 'learning_rate': 1e-05, 'num_tokens': 499667079.0, 'completions/mean_length': 6809.8984375, 'completions/min_length': 739.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6657.92919921875, 'completions/min_terminated_length': 739.0, 'completions/max_terminated_length': 14734.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3077537715435028, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019060201942920685, 'sampling/sampling_logp_difference/max': 8.437488555908203, 'sampling/importance_sampling_ratio/min': 0.00021659342746715993, 'sampling/importance_sampling_ratio/mean': 0.9999881982803345, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0035612359642982, 'clip_ratio/low_mean': 5.5098988696045126e-05, 'clip_ratio/low_min': 3.822045528067974e-06, 'clip_ratio/high_mean': 3.6934502531948965e-06, 'clip_ratio/high_max': 1.4773801012779586e-05, 'clip_ratio/region_mean': 5.8792439290300536e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 590/1024 [3:37:46<20:30:48, 170.16s/it][AINFO 12-02 21:13:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:13:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:13:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:13:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 591/1024 [3:40:48<20:55:32, 173.98s/it][A
+                                                        [A{'loss': 0.0179, 'grad_norm': 0.0012358782114461064, 'learning_rate': 1e-05, 'num_tokens': 500679585.0, 'completions/mean_length': 7742.765625, 'completions/min_length': 1001.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7391.49560546875, 'completions/min_terminated_length': 1001.0, 'completions/max_terminated_length': 15760.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.248829185962677, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020652322098612785, 'sampling/sampling_logp_difference/max': 13.068723678588867, 'sampling/importance_sampling_ratio/min': 2.1102089249325218e-06, 'sampling/importance_sampling_ratio/mean': 0.9998856782913208, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.032991111278534, 'clip_ratio/low_mean': 2.2109893563992955e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.899756502003584e-06, 'clip_ratio/high_max': 2.3599026008014334e-05, 'clip_ratio/region_mean': 2.800965006599654e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 591/1024 [3:40:48<20:55:32, 173.98s/it][AINFO 12-02 21:16:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:16:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:16:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:16:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 592/1024 [3:43:38<20:44:05, 172.79s/it][A
+                                                        [A{'loss': 0.0431, 'grad_norm': 0.0018411773489788175, 'learning_rate': 1e-05, 'num_tokens': 501698114.0, 'completions/mean_length': 7788.1953125, 'completions/min_length': 68.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7510.9111328125, 'completions/min_terminated_length': 68.0, 'completions/max_terminated_length': 16246.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.2648528814315796, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022831827402114868, 'sampling/sampling_logp_difference/max': 10.392937660217285, 'sampling/importance_sampling_ratio/min': 3.0648167012259364e-05, 'sampling/importance_sampling_ratio/mean': 0.9999691247940063, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.211572751402855, 'clip_ratio/low_mean': 4.4509632004974264e-05, 'clip_ratio/low_min': 9.218189916282427e-06, 'clip_ratio/high_mean': 1.198695827042684e-06, 'clip_ratio/high_max': 4.794783308170736e-06, 'clip_ratio/region_mean': 4.570832788886037e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 592/1024 [3:43:38<20:44:05, 172.79s/it][AINFO 12-02 21:19:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:19:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:19:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:19:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 593/1024 [3:46:05<19:44:12, 164.85s/it][A
+                                                        [A{'loss': -0.0133, 'grad_norm': 0.003175790421664715, 'learning_rate': 1e-05, 'num_tokens': 502501991.0, 'completions/mean_length': 6133.4765625, 'completions/min_length': 693.0, 'completions/max_length': 15804.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6133.4765625, 'completions/min_terminated_length': 693.0, 'completions/max_terminated_length': 15804.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.23356689512729645, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019553154706954956, 'sampling/sampling_logp_difference/max': 8.55570125579834, 'sampling/importance_sampling_ratio/min': 0.00019244480063207448, 'sampling/importance_sampling_ratio/mean': 0.9999285340309143, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.928885281085968, 'clip_ratio/low_mean': 3.417267919303413e-05, 'clip_ratio/low_min': 5.46830551684252e-06, 'clip_ratio/high_mean': 6.605505063816963e-06, 'clip_ratio/high_max': 2.6422020255267853e-05, 'clip_ratio/region_mean': 4.077818414316425e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 593/1024 [3:46:05<19:44:12, 164.85s/it][AINFO 12-02 21:21:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:21:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:21:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:21:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 594/1024 [3:49:15<20:36:00, 172.47s/it][A
+                                                        [A{'loss': 0.0033, 'grad_norm': 0.0015484488103538752, 'learning_rate': 1e-05, 'num_tokens': 503441463.0, 'completions/mean_length': 7191.8125, 'completions/min_length': 575.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6971.2001953125, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 15269.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.11336849629878998, 'frac_reward_zero_std': 0.75, 'sampling/sampling_logp_difference/mean': 0.01987777091562748, 'sampling/sampling_logp_difference/max': 10.984257698059082, 'sampling/importance_sampling_ratio/min': 1.6966703697107732e-05, 'sampling/importance_sampling_ratio/mean': 1.000033974647522, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9686790779232979, 'clip_ratio/low_mean': 1.5187597455224022e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.922943170029612e-06, 'clip_ratio/high_max': 7.691772680118447e-06, 'clip_ratio/region_mean': 3.441702915552014e-06, 'epoch': 0.55}
+
+ 58%|█████▊    | 594/1024 [3:49:15<20:36:00, 172.47s/it][AINFO 12-02 21:24:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:24:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:24:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:24:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 595/1024 [3:52:16<20:50:37, 174.91s/it][A
+                                                        [A{'loss': 0.0603, 'grad_norm': 0.0018884538440033793, 'learning_rate': 1e-05, 'num_tokens': 504411486.0, 'completions/mean_length': 7433.2421875, 'completions/min_length': 458.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7069.39013671875, 'completions/min_terminated_length': 458.0, 'completions/max_terminated_length': 16304.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2301519364118576, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019707927480340004, 'sampling/sampling_logp_difference/max': 5.621622085571289, 'sampling/importance_sampling_ratio/min': 0.0036187663208693266, 'sampling/importance_sampling_ratio/mean': 0.9999992251396179, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9139221832156181, 'clip_ratio/low_mean': 3.569773593881109e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.569773593881109e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 595/1024 [3:52:16<20:50:37, 174.91s/it][AINFO 12-02 21:27:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:27:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:27:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:27:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 596/1024 [3:55:08<20:42:11, 174.14s/it][A
+                                                        [A{'loss': 0.0703, 'grad_norm': 0.0015901189763098955, 'learning_rate': 1e-05, 'num_tokens': 505350293.0, 'completions/mean_length': 7196.9296875, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6584.45849609375, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 15373.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019207783043384552, 'sampling/sampling_logp_difference/max': 10.374948501586914, 'sampling/importance_sampling_ratio/min': 3.120449400739744e-05, 'sampling/importance_sampling_ratio/mean': 0.9998888969421387, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9032429754734039, 'clip_ratio/low_mean': 3.6911303482156654e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.715469003713224e-06, 'clip_ratio/high_max': 1.5194896604953101e-05, 'clip_ratio/region_mean': 4.1626772599556716e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 596/1024 [3:55:08<20:42:11, 174.14s/it][AINFO 12-02 21:30:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:30:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:30:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:30:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 597/1024 [3:57:35<19:41:16, 165.99s/it][A
+                                                        [A{'loss': 0.0531, 'grad_norm': 0.0030207515228539705, 'learning_rate': 1e-05, 'num_tokens': 506037040.0, 'completions/mean_length': 5169.6484375, 'completions/min_length': 373.0, 'completions/max_length': 15727.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5169.6484375, 'completions/min_terminated_length': 373.0, 'completions/max_terminated_length': 15727.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.3437528908252716, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019817300140857697, 'sampling/sampling_logp_difference/max': 6.977166175842285, 'sampling/importance_sampling_ratio/min': 0.0009329432505182922, 'sampling/importance_sampling_ratio/mean': 1.0000035762786865, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0305351540446281, 'clip_ratio/low_mean': 5.375093132897746e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.246356512448983e-06, 'clip_ratio/high_max': 3.280831606389256e-05, 'clip_ratio/region_mean': 6.299728784142644e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 597/1024 [3:57:35<19:41:16, 165.99s/it][AINFO 12-02 21:33:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:33:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:33:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:33:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 598/1024 [4:00:06<19:06:44, 161.51s/it][A
+                                                        [A{'loss': 0.0026, 'grad_norm': 0.0015628430992364883, 'learning_rate': 1e-05, 'num_tokens': 506851816.0, 'completions/mean_length': 6172.6875, 'completions/min_length': 232.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6092.283203125, 'completions/min_terminated_length': 232.0, 'completions/max_terminated_length': 14642.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.2580180764198303, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021244537085294724, 'sampling/sampling_logp_difference/max': 9.809249877929688, 'sampling/importance_sampling_ratio/min': 5.494104334502481e-05, 'sampling/importance_sampling_ratio/mean': 0.9998974800109863, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0597273036837578, 'clip_ratio/low_mean': 3.0260603125498164e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.806007327966654e-06, 'clip_ratio/high_max': 1.5224029311866616e-05, 'clip_ratio/region_mean': 3.4066610623995075e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 598/1024 [4:00:06<19:06:44, 161.51s/it][AINFO 12-02 21:35:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:35:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:35:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:35:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 599/1024 [4:02:53<19:15:37, 163.15s/it][A
+                                                        [A{'loss': 0.139, 'grad_norm': 0.0027368192095309496, 'learning_rate': 1e-05, 'num_tokens': 507712066.0, 'completions/mean_length': 6521.640625, 'completions/min_length': 1111.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6365.095703125, 'completions/min_terminated_length': 1111.0, 'completions/max_terminated_length': 15710.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.37009331583976746, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01946975290775299, 'sampling/sampling_logp_difference/max': 6.709050178527832, 'sampling/importance_sampling_ratio/min': 0.0012198220938444138, 'sampling/importance_sampling_ratio/mean': 1.0000547170639038, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0038801655173302, 'clip_ratio/low_mean': 2.306892571368735e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.750191511808225e-06, 'clip_ratio/high_max': 3.15819158913655e-05, 'clip_ratio/region_mean': 3.1819117282338993e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 599/1024 [4:02:53<19:15:37, 163.15s/it][AINFO 12-02 21:38:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:38:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:38:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:38:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▊    | 600/1024 [4:05:39<19:20:02, 164.16s/it][A
+                                                        [A{'loss': 0.0715, 'grad_norm': 0.001597574446350336, 'learning_rate': 1e-05, 'num_tokens': 508660727.0, 'completions/mean_length': 7261.1640625, 'completions/min_length': 1216.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6890.31689453125, 'completions/min_terminated_length': 1216.0, 'completions/max_terminated_length': 15924.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018714535981416702, 'sampling/sampling_logp_difference/max': 6.283580303192139, 'sampling/importance_sampling_ratio/min': 0.0018667052499949932, 'sampling/importance_sampling_ratio/mean': 1.0000066757202148, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8913168013095856, 'clip_ratio/low_mean': 4.648681806429522e-05, 'clip_ratio/low_min': 4.722148787550395e-06, 'clip_ratio/high_mean': 1.221848606292042e-06, 'clip_ratio/high_max': 4.887394425168168e-06, 'clip_ratio/region_mean': 4.770866667058726e-05, 'epoch': 0.55}
+
+ 59%|█████▊    | 600/1024 [4:05:39<19:20:02, 164.16s/it][AINFO 12-02 21:41:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:41:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:41:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:41:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▊    | 601/1024 [4:08:24<19:17:27, 164.18s/it][A
+                                                        [A{'loss': 0.0485, 'grad_norm': 0.0016042378265410662, 'learning_rate': 1e-05, 'num_tokens': 509569763.0, 'completions/mean_length': 6937.65625, 'completions/min_length': 719.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6787.71484375, 'completions/min_terminated_length': 719.0, 'completions/max_terminated_length': 15540.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018897367641329765, 'sampling/sampling_logp_difference/max': 5.251621246337891, 'sampling/importance_sampling_ratio/min': 0.005239017773419619, 'sampling/importance_sampling_ratio/mean': 0.9999073147773743, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9242420196533203, 'clip_ratio/low_mean': 4.8732353320701804e-05, 'clip_ratio/low_min': 3.2816151360748336e-06, 'clip_ratio/high_mean': 1.1722753299636679e-05, 'clip_ratio/high_max': 3.683989120872866e-05, 'clip_ratio/region_mean': 6.045510644980823e-05, 'epoch': 0.55}
+
+ 59%|█████▊    | 601/1024 [4:08:24<19:17:27, 164.18s/it][AINFO 12-02 21:44:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:44:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:44:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:44:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 602/1024 [4:11:20<19:39:48, 167.74s/it][A
+                                                        [A{'loss': 0.0324, 'grad_norm': 0.001268621999770403, 'learning_rate': 1e-05, 'num_tokens': 510422539.0, 'completions/mean_length': 6520.9375, 'completions/min_length': 99.0, 'completions/max_length': 15912.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6520.9375, 'completions/min_terminated_length': 99.0, 'completions/max_terminated_length': 15912.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.30457615852355957, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019956210628151894, 'sampling/sampling_logp_difference/max': 5.932888507843018, 'sampling/importance_sampling_ratio/min': 0.0026508141309022903, 'sampling/importance_sampling_ratio/mean': 0.999987006187439, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9955133497714996, 'clip_ratio/low_mean': 4.6636223032692214e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8077905653844937e-06, 'clip_ratio/high_max': 1.1231162261537975e-05, 'clip_ratio/region_mean': 4.944401439388457e-05, 'epoch': 0.55}
+
+ 59%|█████▉    | 602/1024 [4:11:20<19:39:48, 167.74s/it][AINFO 12-02 21:46:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:46:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:46:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:46:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 603/1024 [4:14:00<19:21:26, 165.53s/it][A
+                                                        [A{'loss': -0.0011, 'grad_norm': 0.002995383460074663, 'learning_rate': 1e-05, 'num_tokens': 511233622.0, 'completions/mean_length': 6166.8359375, 'completions/min_length': 546.0, 'completions/max_length': 15892.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6166.8359375, 'completions/min_terminated_length': 546.0, 'completions/max_terminated_length': 15892.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.0196513794362545, 'sampling/sampling_logp_difference/max': 5.518129348754883, 'sampling/importance_sampling_ratio/min': 0.004013348370790482, 'sampling/importance_sampling_ratio/mean': 0.9999934434890747, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1260707154870033, 'clip_ratio/low_mean': 2.5951596285267442e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6921659355139127e-06, 'clip_ratio/high_max': 1.076866374205565e-05, 'clip_ratio/region_mean': 2.8643762107094517e-05, 'epoch': 0.55}
+
+ 59%|█████▉    | 603/1024 [4:14:00<19:21:26, 165.53s/it][AINFO 12-02 21:49:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:49:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:49:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:49:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 604/1024 [4:17:05<19:59:01, 171.29s/it][A
+                                                        [A{'loss': 0.1146, 'grad_norm': 0.002102211117744446, 'learning_rate': 1e-05, 'num_tokens': 512252707.0, 'completions/mean_length': 7813.1015625, 'completions/min_length': 967.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7391.58154296875, 'completions/min_terminated_length': 967.0, 'completions/max_terminated_length': 16265.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.32483339309692383, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01952178217470646, 'sampling/sampling_logp_difference/max': 13.942327499389648, 'sampling/importance_sampling_ratio/min': 8.808949019112333e-07, 'sampling/importance_sampling_ratio/mean': 1.0000017881393433, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9835772663354874, 'clip_ratio/low_mean': 6.284466144279577e-05, 'clip_ratio/low_min': 1.958670782187255e-05, 'clip_ratio/high_mean': 2.000828430936963e-06, 'clip_ratio/high_max': 8.003313723747851e-06, 'clip_ratio/region_mean': 6.48454897600459e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 604/1024 [4:17:05<19:59:01, 171.29s/it][AINFO 12-02 21:52:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:52:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:52:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:52:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 605/1024 [4:19:32<19:04:49, 163.94s/it][A
+                                                        [A{'loss': 0.0173, 'grad_norm': 0.0019877345766872168, 'learning_rate': 1e-05, 'num_tokens': 513072713.0, 'completions/mean_length': 6258.796875, 'completions/min_length': 700.0, 'completions/max_length': 15709.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6258.796875, 'completions/min_terminated_length': 700.0, 'completions/max_terminated_length': 15709.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.23516449332237244, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02054009959101677, 'sampling/sampling_logp_difference/max': 7.216361999511719, 'sampling/importance_sampling_ratio/min': 0.0007344695623032749, 'sampling/importance_sampling_ratio/mean': 0.9999274015426636, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0968554988503456, 'clip_ratio/low_mean': 2.409284547866264e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.285791760594293e-06, 'clip_ratio/high_max': 1.7143167042377172e-05, 'clip_ratio/region_mean': 2.8378637239256932e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 605/1024 [4:19:32<19:04:49, 163.94s/it][AINFO 12-02 21:55:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:55:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:55:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:55:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 606/1024 [4:22:33<19:38:27, 169.16s/it][A
+                                                        [A{'loss': 0.0468, 'grad_norm': 0.0010973098687827587, 'learning_rate': 1e-05, 'num_tokens': 514027698.0, 'completions/mean_length': 7305.0703125, 'completions/min_length': 1891.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7233.58251953125, 'completions/min_terminated_length': 1891.0, 'completions/max_terminated_length': 16104.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.1883118748664856, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01929587498307228, 'sampling/sampling_logp_difference/max': 5.893075942993164, 'sampling/importance_sampling_ratio/min': 0.002758478745818138, 'sampling/importance_sampling_ratio/mean': 0.9999312162399292, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8919311985373497, 'clip_ratio/low_mean': 2.8181228003631986e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.880172910925467e-07, 'clip_ratio/high_max': 3.952069164370187e-06, 'clip_ratio/region_mean': 2.9169245294724533e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 606/1024 [4:22:33<19:38:27, 169.16s/it][AINFO 12-02 21:58:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:58:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:58:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:58:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 607/1024 [4:25:42<20:17:02, 175.11s/it][A
+                                                        [A{'loss': 0.0333, 'grad_norm': 0.0011414727196097374, 'learning_rate': 1e-05, 'num_tokens': 514974087.0, 'completions/mean_length': 7244.7890625, 'completions/min_length': 1198.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6873.2763671875, 'completions/min_terminated_length': 1198.0, 'completions/max_terminated_length': 16238.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.27434611320495605, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018441129475831985, 'sampling/sampling_logp_difference/max': 8.622167587280273, 'sampling/importance_sampling_ratio/min': 0.00018006951722782105, 'sampling/importance_sampling_ratio/mean': 0.9998536109924316, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.819145604968071, 'clip_ratio/low_mean': 4.132410458623781e-05, 'clip_ratio/low_min': 4.114007424504962e-06, 'clip_ratio/high_mean': 7.019159511401085e-07, 'clip_ratio/high_max': 2.807663804560434e-06, 'clip_ratio/region_mean': 4.202602053737792e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 607/1024 [4:25:42<20:17:02, 175.11s/it][AINFO 12-02 22:01:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:01:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:01:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:01:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 608/1024 [4:28:41<20:21:42, 176.21s/it][A
+                                                        [A{'loss': 0.1056, 'grad_norm': 0.0021221793722361326, 'learning_rate': 1e-05, 'num_tokens': 515820929.0, 'completions/mean_length': 6446.578125, 'completions/min_length': 368.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6288.841796875, 'completions/min_terminated_length': 368.0, 'completions/max_terminated_length': 15763.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.30168038606643677, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019936196506023407, 'sampling/sampling_logp_difference/max': 9.586402893066406, 'sampling/importance_sampling_ratio/min': 6.865594332339242e-05, 'sampling/importance_sampling_ratio/mean': 0.9999166131019592, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0150432661175728, 'clip_ratio/low_mean': 4.5529439148594975e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.348170806333655e-07, 'clip_ratio/high_max': 3.739268322533462e-06, 'clip_ratio/region_mean': 4.6464256456602016e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 608/1024 [4:28:41<20:21:42, 176.21s/it][AINFO 12-02 22:04:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:04:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:04:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:04:17 [block_pool.py:292] Successfully reset prefix cache
diff --git a/grpo_lora_20251130_192918/checkpoint-704/adapter_config.json b/grpo_lora_20251130_192918/checkpoint-704/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..da884a8eb3c02a42d08fe869da98a8ad4366197d
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-704/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "gate_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file